github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ruler/base/ruler.go (about)

     1  package base
     2  
     3  import (
     4  	"context"
     5  	"flag"
     6  	"fmt"
     7  	"hash/fnv"
     8  	"net/http"
     9  	"net/url"
    10  	"path/filepath"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/go-kit/log"
    16  	"github.com/go-kit/log/level"
    17  	"github.com/grafana/dskit/concurrency"
    18  	"github.com/grafana/dskit/flagext"
    19  	"github.com/grafana/dskit/grpcclient"
    20  	"github.com/grafana/dskit/kv"
    21  	"github.com/grafana/dskit/ring"
    22  	"github.com/grafana/dskit/services"
    23  	"github.com/pkg/errors"
    24  	"github.com/prometheus/client_golang/prometheus"
    25  	"github.com/prometheus/client_golang/prometheus/promauto"
    26  	"github.com/prometheus/prometheus/model/labels"
    27  	"github.com/prometheus/prometheus/model/relabel"
    28  	"github.com/prometheus/prometheus/model/rulefmt"
    29  	"github.com/prometheus/prometheus/notifier"
    30  	promRules "github.com/prometheus/prometheus/rules"
    31  	"github.com/prometheus/prometheus/util/strutil"
    32  	"github.com/weaveworks/common/user"
    33  	"golang.org/x/sync/errgroup"
    34  
    35  	"github.com/grafana/dskit/tenant"
    36  
    37  	"github.com/grafana/loki/pkg/logproto"
    38  	"github.com/grafana/loki/pkg/ruler/rulespb"
    39  	"github.com/grafana/loki/pkg/ruler/rulestore"
    40  	"github.com/grafana/loki/pkg/util"
    41  	util_log "github.com/grafana/loki/pkg/util/log"
    42  	"github.com/grafana/loki/pkg/util/validation"
    43  )
    44  
var (
	// supportedShardingStrategies lists the values accepted for Config.ShardingStrategy.
	// Config.Validate rejects any other value, and the list is printed in the
	// help text of the -ruler.sharding-strategy flag.
	supportedShardingStrategies = []string{util.ShardingStrategyDefault, util.ShardingStrategyShuffle}

	// Validation errors returned by Config.Validate.
	errInvalidShardingStrategy = errors.New("invalid sharding strategy")
	errInvalidTenantShardSize  = errors.New("invalid tenant shard size, the value must be greater than 0")
)
    52  
const (
	// ringKey is the key under which we store the rulers ring in the KVStore.
	ringKey = "rulers"

	// Number of concurrent group list and group load operations.
	loadRulesConcurrency  = 10
	fetchRulesConcurrency = 16

	// Values of the "reason" label recorded each time a rule sync is triggered.
	rulerSyncReasonInitial    = "initial"
	rulerSyncReasonPeriodic   = "periodic"
	rulerSyncReasonRingChange = "ring-change"

	// Format strings for limit errors; each one takes the limit and the actual value.
	errMaxRuleGroupsPerUserLimitExceeded        = "per-user rule groups limit (limit: %d actual: %d) exceeded"
	errMaxRulesPerRuleGroupPerUserLimitExceeded = "per-user rules per rule group limit (limit: %d actual: %d) exceeded"

	// Error messages.
	errListAllUser = "unable to list the ruler users"
)
    72  
// Config is the configuration for the recording rules server.
// Defaults for most fields are installed by RegisterFlags.
type Config struct {
	// This is used for template expansion in alerts; must be a valid URL.
	ExternalURL flagext.URLValue `yaml:"external_url"`
	// Labels to add to all alerts.
	ExternalLabels labels.Labels `yaml:"external_labels,omitempty"`
	// GRPC Client configuration.
	ClientTLSConfig grpcclient.Config `yaml:"ruler_client"`
	// How frequently to evaluate rules by default.
	EvaluationInterval time.Duration `yaml:"evaluation_interval"`
	// How frequently to poll for updated rules.
	PollInterval time.Duration `yaml:"poll_interval"`
	// Rule Storage and Polling configuration.
	StoreConfig RuleStoreConfig `yaml:"storage" doc:"description=Deprecated. Use -ruler-storage.* CLI flags and their respective YAML config options instead."`
	// Path to store rule files for prom manager.
	RulePath string `yaml:"rule_path"`

	// Comma-separated list of URL(s) of the Alertmanager(s) to send notifications to.
	AlertmanagerURL string `yaml:"alertmanager_url"`
	// Whether to use DNS SRV records to discover Alertmanager.
	AlertmanagerDiscovery bool `yaml:"enable_alertmanager_discovery"`
	// How long to wait between refreshing the list of Alertmanager based on DNS service discovery.
	AlertmanagerRefreshInterval time.Duration `yaml:"alertmanager_refresh_interval"`
	// Enables the ruler notifier to use the Alertmanager V2 API.
	AlertmanangerEnableV2API bool `yaml:"enable_alertmanager_v2"`
	// Configuration for alert relabeling.
	AlertRelabelConfigs []*relabel.Config `yaml:"alert_relabel_configs,omitempty"`
	// Capacity of the queue for notifications to be sent to the Alertmanager.
	NotificationQueueCapacity int `yaml:"notification_queue_capacity"`
	// HTTP timeout duration when sending notifications to the Alertmanager.
	NotificationTimeout time.Duration `yaml:"notification_timeout"`
	// Client configs for interacting with the Alertmanager.
	Notifier NotifierConfig `yaml:"alertmanager_client"`

	// Max time to tolerate outage for restoring "for" state of alert.
	OutageTolerance time.Duration `yaml:"for_outage_tolerance"`
	// Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.
	ForGracePeriod time.Duration `yaml:"for_grace_period"`
	// Minimum amount of time to wait before resending an alert to Alertmanager.
	ResendDelay time.Duration `yaml:"resend_delay"`

	// Sharding of rule groups across rulers:
	//   - EnableSharding distributes rule evaluation using the ring backend.
	//   - ShardingStrategy selects the sharding strategy; it must be one of
	//     supportedShardingStrategies.
	//   - SearchPendingFor is the time to spend searching for a pending ruler
	//     when shutting down.
	//   - Ring is the configuration of the rulers ring.
	//   - FlushCheckPeriod is the period with which to attempt to flush rule groups.
	EnableSharding   bool          `yaml:"enable_sharding"`
	ShardingStrategy string        `yaml:"sharding_strategy"`
	SearchPendingFor time.Duration `yaml:"search_pending_for"`
	Ring             RingConfig    `yaml:"ring"`
	FlushCheckPeriod time.Duration `yaml:"flush_period"`

	// Enable the ruler API.
	EnableAPI bool `yaml:"enable_api"`

	// Tenant filtering, subject to sharding. When EnabledTenants is set, only
	// those tenants are handled by this ruler; tenants listed in DisabledTenants
	// are ignored by this ruler.
	EnabledTenants  flagext.StringSliceCSV `yaml:"enabled_tenants"`
	DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`

	// How often the ring is checked for changes while running. Not configurable
	// via YAML; RegisterFlags sets it to 5 seconds.
	RingCheckPeriod time.Duration `yaml:"-"`

	// EnableQueryStats reports the wall time for ruler queries to complete as a
	// per user metric and as an info level log message.
	// DisableRuleGroupLabel disables the rule_group label on exported metrics.
	EnableQueryStats      bool `yaml:"query_stats_enabled"`
	DisableRuleGroupLabel bool `yaml:"disable_rule_group_label"`
}
   131  
   132  // Validate config and returns error on failure
   133  func (cfg *Config) Validate(limits validation.Limits, log log.Logger) error {
   134  	if !util.StringsContain(supportedShardingStrategies, cfg.ShardingStrategy) {
   135  		return errInvalidShardingStrategy
   136  	}
   137  
   138  	if cfg.ShardingStrategy == util.ShardingStrategyShuffle && limits.RulerTenantShardSize <= 0 {
   139  		return errInvalidTenantShardSize
   140  	}
   141  
   142  	if err := cfg.StoreConfig.Validate(); err != nil {
   143  		return errors.Wrap(err, "invalid storage config")
   144  	}
   145  	if err := cfg.ClientTLSConfig.Validate(log); err != nil {
   146  		return errors.Wrap(err, "invalid ruler gRPC client config")
   147  	}
   148  	return nil
   149  }
   150  
// RegisterFlags adds the flags required to configure the ruler to the given
// FlagSet and installs the default value of every flag-backed field. It also
// registers the flags of the nested client, store, ring and notifier configs,
// and sets RingCheckPeriod, which has no flag.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	cfg.ClientTLSConfig.RegisterFlagsWithPrefix("ruler.client", f)
	cfg.StoreConfig.RegisterFlags(f)
	cfg.Ring.RegisterFlags(f)
	cfg.Notifier.RegisterFlags(f)

	// Deprecated Flags that will be maintained to avoid user disruption

	//lint:ignore faillint Need to pass the global logger like this for warning on deprecated methods
	flagext.DeprecatedFlag(f, "ruler.client-timeout", "This flag has been renamed to ruler.configs.client-timeout", util_log.Logger)
	//lint:ignore faillint Need to pass the global logger like this for warning on deprecated methods
	flagext.DeprecatedFlag(f, "ruler.group-timeout", "This flag is no longer functional.", util_log.Logger)
	//lint:ignore faillint Need to pass the global logger like this for warning on deprecated methods
	flagext.DeprecatedFlag(f, "ruler.num-workers", "This flag is no longer functional. For increased concurrency horizontal sharding is recommended", util_log.Logger)

	// Evaluation and polling.
	cfg.ExternalURL.URL, _ = url.Parse("") // Must be non-nil
	f.Var(&cfg.ExternalURL, "ruler.external.url", "URL of alerts return path.")
	f.DurationVar(&cfg.EvaluationInterval, "ruler.evaluation-interval", 1*time.Minute, "How frequently to evaluate rules")
	f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes")

	// Alertmanager discovery and notification delivery.
	f.StringVar(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "", "Comma-separated list of URL(s) of the Alertmanager(s) to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. Multiple Alertmanagers in HA per group can be supported by using DNS resolution via -ruler.alertmanager-discovery.")
	f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover Alertmanager hosts.")
	f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing DNS resolutions of Alertmanager hosts.")
	f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to Alertmanager will utilize the V2 API.")
	f.IntVar(&cfg.NotificationQueueCapacity, "ruler.notification-queue-capacity", 10000, "Capacity of the queue for notifications to be sent to the Alertmanager.")
	f.DurationVar(&cfg.NotificationTimeout, "ruler.notification-timeout", 10*time.Second, "HTTP timeout duration when sending notifications to the Alertmanager.")

	// Sharding, rule files, API and alert "for" state handling.
	f.DurationVar(&cfg.SearchPendingFor, "ruler.search-pending-for", 5*time.Minute, "Time to spend searching for a pending ruler when shutting down.")
	f.BoolVar(&cfg.EnableSharding, "ruler.enable-sharding", false, "Distribute rule evaluation using ring backend")
	f.StringVar(&cfg.ShardingStrategy, "ruler.sharding-strategy", util.ShardingStrategyDefault, fmt.Sprintf("The sharding strategy to use. Supported values are: %s.", strings.Join(supportedShardingStrategies, ", ")))
	f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
	f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers")
	f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api")
	f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`)
	f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`)
	f.DurationVar(&cfg.ResendDelay, "ruler.resend-delay", time.Minute, `Minimum amount of time to wait before resending an alert to Alertmanager.`)

	// Tenant allow/deny lists.
	f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.")
	f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.")

	// Metrics and logging options.
	f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.")
	f.BoolVar(&cfg.DisableRuleGroupLabel, "ruler.disable-rule-group-label", false, "Disable the rule_group label on exported metrics")

	// Not exposed as a flag; the value is fixed here.
	cfg.RingCheckPeriod = 5 * time.Second
}
   197  
// MultiTenantManager is the interface of interaction with a Manager that is tenant aware.
// The Ruler drives it: rule groups are pushed to it on every sync, and it is
// stopped when the Ruler is stopping.
type MultiTenantManager interface {
	// SyncRuleGroups is used to sync the Manager with rules from the RuleStore.
	// ruleGroups is keyed by user ID. If an existing user is missing from the
	// ruleGroups map, its ruler manager will be stopped.
	SyncRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList)
	// GetRules fetches rules for a particular tenant (userID).
	GetRules(userID string) []*promRules.Group
	// Stop stops all Manager components.
	Stop()
	// ValidateRuleGroup validates a rule group and returns the errors found.
	ValidateRuleGroup(rulefmt.RuleGroup) []error
}
   210  
// Ruler evaluates rules.
//	+---------------------------------------------------------------+
//	|                                                               |
//	|                   Query       +-------------+                 |
//	|            +------------------>             |                 |
//	|            |                  |    Store    |                 |
//	|            | +----------------+             |                 |
//	|            | |     Rules      +-------------+                 |
//	|            | |                                                |
//	|            | |                                                |
//	|            | |                                                |
//	|       +----+-v----+   Filter  +------------+                  |
//	|       |           +----------->            |                  |
//	|       |   Ruler   |           |    Ring    |                  |
//	|       |           <-----------+            |                  |
//	|       +-------+---+   Rules   +------------+                  |
//	|               |                                               |
//	|               |                                               |
//	|               |                                               |
//	|               |    Load      +-----------------+              |
//	|               +-------------->                 |              |
//	|                              |     Manager     |              |
//	|                              |                 |              |
//	|                              +-----------------+              |
//	|                                                               |
//	+---------------------------------------------------------------+
type Ruler struct {
	services.Service

	// cfg is the ruler configuration. lifecycler and ring are only initialised
	// when sharding is enabled (see enableSharding). store is where rule groups
	// are listed and loaded from, manager is what the loaded groups are synced
	// into, and limits provides per-tenant limits such as the tenant shard size.
	cfg        Config
	lifecycler *ring.BasicLifecycler
	ring       *ring.Ring
	store      rulestore.RuleStore
	manager    MultiTenantManager
	limits     RulesLimits

	// Subservices (lifecycler, ring and clients pool) and their failure watcher.
	// Both are only created in starting when sharding is enabled.
	subservices        *services.Manager
	subservicesWatcher *services.FailureWatcher

	// Pool of clients used to connect to other ruler replicas.
	clientsPool ClientsPool

	// ringCheckErrors counts errors that occurred when checking the ring for
	// ownership; rulerSync counts sync operations by trigger reason.
	ringCheckErrors prometheus.Counter
	rulerSync       *prometheus.CounterVec

	// allowedTenants is built from cfg.EnabledTenants and cfg.DisabledTenants
	// and is used to drop rule groups of tenants this ruler must not handle.
	allowedTenants *util.AllowedTenants

	// registry is the registerer the ruler's metrics are registered with.
	registry prometheus.Registerer
	logger   log.Logger
}
   261  
   262  // NewRuler creates a new ruler from a distributor and chunk store.
   263  func NewRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer, logger log.Logger, ruleStore rulestore.RuleStore, limits RulesLimits) (*Ruler, error) {
   264  	return newRuler(cfg, manager, reg, logger, ruleStore, limits, newRulerClientPool(cfg.ClientTLSConfig, logger, reg))
   265  }
   266  
   267  func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer, logger log.Logger, ruleStore rulestore.RuleStore, limits RulesLimits, clientPool ClientsPool) (*Ruler, error) {
   268  	ruler := &Ruler{
   269  		cfg:            cfg,
   270  		store:          ruleStore,
   271  		manager:        manager,
   272  		registry:       reg,
   273  		logger:         logger,
   274  		limits:         limits,
   275  		clientsPool:    clientPool,
   276  		allowedTenants: util.NewAllowedTenants(cfg.EnabledTenants, cfg.DisabledTenants),
   277  
   278  		ringCheckErrors: promauto.With(reg).NewCounter(prometheus.CounterOpts{
   279  			Name: "cortex_ruler_ring_check_errors_total",
   280  			Help: "Number of errors that have occurred when checking the ring for ownership",
   281  		}),
   282  
   283  		rulerSync: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   284  			Name: "cortex_ruler_sync_rules_total",
   285  			Help: "Total number of times the ruler sync operation triggered.",
   286  		}, []string{"reason"}),
   287  	}
   288  
   289  	if len(cfg.EnabledTenants) > 0 {
   290  		level.Info(ruler.logger).Log("msg", "ruler using enabled users", "enabled", strings.Join(cfg.EnabledTenants, ", "))
   291  	}
   292  	if len(cfg.DisabledTenants) > 0 {
   293  		level.Info(ruler.logger).Log("msg", "ruler using disabled users", "disabled", strings.Join(cfg.DisabledTenants, ", "))
   294  	}
   295  
   296  	if cfg.EnableSharding {
   297  		ringStore, err := kv.NewClient(
   298  			cfg.Ring.KVStore,
   299  			ring.GetCodec(),
   300  			kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("cortex_", reg), "ruler"),
   301  			logger,
   302  		)
   303  		if err != nil {
   304  			return nil, errors.Wrap(err, "create KV store client")
   305  		}
   306  
   307  		if err = enableSharding(ruler, ringStore); err != nil {
   308  			return nil, errors.Wrap(err, "setup ruler sharding ring")
   309  		}
   310  	}
   311  
   312  	ruler.Service = services.NewBasicService(ruler.starting, ruler.run, ruler.stopping)
   313  	return ruler, nil
   314  }
   315  
   316  func enableSharding(r *Ruler, ringStore kv.Client) error {
   317  	lifecyclerCfg, err := r.cfg.Ring.ToLifecyclerConfig(r.logger)
   318  	if err != nil {
   319  		return errors.Wrap(err, "failed to initialize ruler's lifecycler config")
   320  	}
   321  
   322  	// Define lifecycler delegates in reverse order (last to be called defined first because they're
   323  	// chained via "next delegate").
   324  	delegate := ring.BasicLifecyclerDelegate(r)
   325  	delegate = ring.NewLeaveOnStoppingDelegate(delegate, r.logger)
   326  	delegate = ring.NewAutoForgetDelegate(r.cfg.Ring.HeartbeatTimeout*ringAutoForgetUnhealthyPeriods, delegate, r.logger)
   327  
   328  	rulerRingName := "ruler"
   329  	r.lifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, rulerRingName, ringKey, ringStore, delegate, r.logger, prometheus.WrapRegistererWithPrefix("cortex_", r.registry))
   330  	if err != nil {
   331  		return errors.Wrap(err, "failed to initialize ruler's lifecycler")
   332  	}
   333  
   334  	r.ring, err = ring.NewWithStoreClientAndStrategy(r.cfg.Ring.ToRingConfig(), rulerRingName, ringKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", r.registry), r.logger)
   335  	if err != nil {
   336  		return errors.Wrap(err, "failed to initialize ruler's ring")
   337  	}
   338  
   339  	return nil
   340  }
   341  
   342  func (r *Ruler) starting(ctx context.Context) error {
   343  	// If sharding is enabled, start the used subservices.
   344  	if r.cfg.EnableSharding {
   345  		var err error
   346  
   347  		if r.subservices, err = services.NewManager(r.lifecycler, r.ring, r.clientsPool); err != nil {
   348  			return errors.Wrap(err, "unable to start ruler subservices")
   349  		}
   350  
   351  		r.subservicesWatcher = services.NewFailureWatcher()
   352  		r.subservicesWatcher.WatchManager(r.subservices)
   353  
   354  		if err = services.StartManagerAndAwaitHealthy(ctx, r.subservices); err != nil {
   355  			return errors.Wrap(err, "unable to start ruler subservices")
   356  		}
   357  	}
   358  
   359  	// TODO: ideally, ruler would wait until its queryable is finished starting.
   360  	return nil
   361  }
   362  
   363  // Stop stops the Ruler.
   364  // Each function of the ruler is terminated before leaving the ring
   365  func (r *Ruler) stopping(_ error) error {
   366  	r.manager.Stop()
   367  
   368  	if r.subservices != nil {
   369  		_ = services.StopManagerAndAwaitStopped(context.Background(), r.subservices)
   370  	}
   371  	return nil
   372  }
   373  
// sender is the minimal notifier interface SendAlerts needs: something that
// accepts a batch of alerts to deliver.
type sender interface {
	// Send hands the given alerts over for delivery.
	Send(alerts ...*notifier.Alert)
}
   377  
   378  // SendAlerts implements a rules.NotifyFunc for a Notifier.
   379  // It filters any non-firing alerts from the input.
   380  //
   381  // Copied from Prometheus's main.go.
   382  func SendAlerts(n sender, externalURL string) promRules.NotifyFunc {
   383  	return func(ctx context.Context, expr string, alerts ...*promRules.Alert) {
   384  		var res []*notifier.Alert
   385  
   386  		for _, alert := range alerts {
   387  			a := &notifier.Alert{
   388  				StartsAt:     alert.FiredAt,
   389  				Labels:       alert.Labels,
   390  				Annotations:  alert.Annotations,
   391  				GeneratorURL: externalURL + strutil.TableLinkForExpression(expr),
   392  			}
   393  			if !alert.ResolvedAt.IsZero() {
   394  				a.EndsAt = alert.ResolvedAt
   395  			} else {
   396  				a.EndsAt = alert.ValidUntil
   397  			}
   398  			res = append(res, a)
   399  		}
   400  
   401  		if len(alerts) > 0 {
   402  			n.Send(res...)
   403  		}
   404  	}
   405  }
   406  
   407  var sep = []byte("/")
   408  
   409  func tokenForGroup(g *rulespb.RuleGroupDesc) uint32 {
   410  	ringHasher := fnv.New32a()
   411  
   412  	// Hasher never returns err.
   413  	_, _ = ringHasher.Write([]byte(g.User))
   414  	_, _ = ringHasher.Write(sep)
   415  	_, _ = ringHasher.Write([]byte(g.Namespace))
   416  	_, _ = ringHasher.Write(sep)
   417  	_, _ = ringHasher.Write([]byte(g.Name))
   418  
   419  	return ringHasher.Sum32()
   420  }
   421  
   422  func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, instanceAddr string) (bool, error) {
   423  	hash := tokenForGroup(g)
   424  
   425  	rlrs, err := r.Get(hash, RingOp, nil, nil, nil)
   426  	if err != nil {
   427  		return false, errors.Wrap(err, "error reading ring to verify rule group ownership")
   428  	}
   429  
   430  	return rlrs.Instances[0].Addr == instanceAddr, nil
   431  }
   432  
   433  func (r *Ruler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
   434  	if r.cfg.EnableSharding {
   435  		r.ring.ServeHTTP(w, req)
   436  	} else {
   437  		unshardedPage := `
   438  			<!DOCTYPE html>
   439  			<html>
   440  				<head>
   441  					<meta charset="UTF-8">
   442  					<title>Cortex Ruler Status</title>
   443  				</head>
   444  				<body>
   445  					<h1>Cortex Ruler Status</h1>
   446  					<p>Ruler running with shards disabled</p>
   447  				</body>
   448  			</html>`
   449  		util.WriteHTMLResponse(w, unshardedPage)
   450  	}
   451  }
   452  
   453  func (r *Ruler) run(ctx context.Context) error {
   454  	level.Info(r.logger).Log("msg", "ruler up and running")
   455  
   456  	tick := time.NewTicker(r.cfg.PollInterval)
   457  	defer tick.Stop()
   458  
   459  	var ringTickerChan <-chan time.Time
   460  	var ringLastState ring.ReplicationSet
   461  
   462  	if r.cfg.EnableSharding {
   463  		ringLastState, _ = r.ring.GetAllHealthy(RingOp)
   464  		ringTicker := time.NewTicker(util.DurationWithJitter(r.cfg.RingCheckPeriod, 0.2))
   465  		defer ringTicker.Stop()
   466  		ringTickerChan = ringTicker.C
   467  	}
   468  
   469  	r.syncRules(ctx, rulerSyncReasonInitial)
   470  	for {
   471  		select {
   472  		case <-ctx.Done():
   473  			return nil
   474  		case <-tick.C:
   475  			r.syncRules(ctx, rulerSyncReasonPeriodic)
   476  		case <-ringTickerChan:
   477  			// We ignore the error because in case of error it will return an empty
   478  			// replication set which we use to compare with the previous state.
   479  			currRingState, _ := r.ring.GetAllHealthy(RingOp)
   480  
   481  			if ring.HasReplicationSetChanged(ringLastState, currRingState) {
   482  				ringLastState = currRingState
   483  				r.syncRules(ctx, rulerSyncReasonRingChange)
   484  			}
   485  		case err := <-r.subservicesWatcher.Chan():
   486  			return errors.Wrap(err, "ruler subservice failed")
   487  		}
   488  	}
   489  }
   490  
   491  func (r *Ruler) syncRules(ctx context.Context, reason string) {
   492  	level.Debug(r.logger).Log("msg", "syncing rules", "reason", reason)
   493  	r.rulerSync.WithLabelValues(reason).Inc()
   494  
   495  	configs, err := r.listRules(ctx)
   496  	if err != nil {
   497  		level.Error(r.logger).Log("msg", "unable to list rules", "err", err)
   498  		return
   499  	}
   500  
   501  	err = r.store.LoadRuleGroups(ctx, configs)
   502  	if err != nil {
   503  		level.Error(r.logger).Log("msg", "unable to load rules owned by this ruler", "err", err)
   504  		return
   505  	}
   506  
   507  	// This will also delete local group files for users that are no longer in 'configs' map.
   508  	r.manager.SyncRuleGroups(ctx, configs)
   509  }
   510  
   511  func (r *Ruler) listRules(ctx context.Context) (result map[string]rulespb.RuleGroupList, err error) {
   512  	switch {
   513  	case !r.cfg.EnableSharding:
   514  		result, err = r.listRulesNoSharding(ctx)
   515  
   516  	case r.cfg.ShardingStrategy == util.ShardingStrategyDefault:
   517  		result, err = r.listRulesShardingDefault(ctx)
   518  
   519  	case r.cfg.ShardingStrategy == util.ShardingStrategyShuffle:
   520  		result, err = r.listRulesShuffleSharding(ctx)
   521  
   522  	default:
   523  		return nil, errors.New("invalid sharding configuration")
   524  	}
   525  
   526  	if err != nil {
   527  		return
   528  	}
   529  
   530  	for userID := range result {
   531  		if !r.allowedTenants.IsAllowed(userID) {
   532  			level.Debug(r.logger).Log("msg", "ignoring rule groups for user, not allowed", "user", userID)
   533  			delete(result, userID)
   534  		}
   535  	}
   536  	return
   537  }
   538  
   539  func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
   540  	return r.store.ListAllRuleGroups(ctx)
   541  }
   542  
   543  func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
   544  	configs, err := r.store.ListAllRuleGroups(ctx)
   545  	if err != nil {
   546  		return nil, err
   547  	}
   548  
   549  	filteredConfigs := make(map[string]rulespb.RuleGroupList)
   550  	for userID, groups := range configs {
   551  		filtered := filterRuleGroups(userID, groups, r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
   552  		if len(filtered) > 0 {
   553  			filteredConfigs[userID] = filtered
   554  		}
   555  	}
   556  	return filteredConfigs, nil
   557  }
   558  
   559  func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
   560  	users, err := r.store.ListAllUsers(ctx)
   561  	if err != nil {
   562  		return nil, errors.Wrap(err, "unable to list users of ruler")
   563  	}
   564  
   565  	// Only users in userRings will be used in the to load the rules.
   566  	userRings := map[string]ring.ReadRing{}
   567  	for _, u := range users {
   568  		if shardSize := r.limits.RulerTenantShardSize(u); shardSize > 0 {
   569  			subRing := r.ring.ShuffleShard(u, shardSize)
   570  
   571  			// Include the user only if it belongs to this ruler shard.
   572  			if subRing.HasInstance(r.lifecycler.GetInstanceID()) {
   573  				userRings[u] = subRing
   574  			}
   575  		} else {
   576  			// A shard size of 0 means shuffle sharding is disabled for this specific user.
   577  			// In that case we use the full ring so that rule groups will be sharded across all rulers.
   578  			userRings[u] = r.ring
   579  		}
   580  	}
   581  
   582  	if len(userRings) == 0 {
   583  		return nil, nil
   584  	}
   585  
   586  	userCh := make(chan string, len(userRings))
   587  	for u := range userRings {
   588  		userCh <- u
   589  	}
   590  	close(userCh)
   591  
   592  	mu := sync.Mutex{}
   593  	result := map[string]rulespb.RuleGroupList{}
   594  
   595  	concurrency := loadRulesConcurrency
   596  	if len(userRings) < concurrency {
   597  		concurrency = len(userRings)
   598  	}
   599  
   600  	g, gctx := errgroup.WithContext(ctx)
   601  	for i := 0; i < concurrency; i++ {
   602  		g.Go(func() error {
   603  			for userID := range userCh {
   604  				groups, err := r.store.ListRuleGroupsForUserAndNamespace(gctx, userID, "")
   605  				if err != nil {
   606  					return errors.Wrapf(err, "failed to fetch rule groups for user %s", userID)
   607  				}
   608  
   609  				filtered := filterRuleGroups(userID, groups, userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
   610  				if len(filtered) == 0 {
   611  					continue
   612  				}
   613  
   614  				mu.Lock()
   615  				result[userID] = filtered
   616  				mu.Unlock()
   617  			}
   618  			return nil
   619  		})
   620  	}
   621  
   622  	err = g.Wait()
   623  	return result, err
   624  }
   625  
   626  // filterRuleGroups returns map of rule groups that given instance "owns" based on supplied ring.
   627  // This function only uses User, Namespace, and Name fields of individual RuleGroups.
   628  //
   629  // Reason why this function is not a method on Ruler is to make sure we don't accidentally use r.ring,
   630  // but only ring passed as parameter.
   631  func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc {
   632  	// Prune the rule group to only contain rules that this ruler is responsible for, based on ring.
   633  	var result []*rulespb.RuleGroupDesc
   634  	for _, g := range ruleGroups {
   635  		owned, err := instanceOwnsRuleGroup(ring, g, instanceAddr)
   636  		if err != nil {
   637  			ringCheckErrors.Inc()
   638  			level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err)
   639  			continue
   640  		}
   641  
   642  		if owned {
   643  			level.Debug(log).Log("msg", "rule group owned", "user", g.User, "namespace", g.Namespace, "name", g.Name)
   644  			result = append(result, g)
   645  		} else {
   646  			level.Debug(log).Log("msg", "rule group not owned, ignoring", "user", g.User, "namespace", g.Namespace, "name", g.Name)
   647  		}
   648  	}
   649  
   650  	return result
   651  }
   652  
   653  // GetRules retrieves the running rules from this ruler and all running rulers in the ring if
   654  // sharding is enabled
   655  func (r *Ruler) GetRules(ctx context.Context) ([]*GroupStateDesc, error) {
   656  	userID, err := tenant.TenantID(ctx)
   657  	if err != nil {
   658  		return nil, fmt.Errorf("no user id found in context")
   659  	}
   660  
   661  	if r.cfg.EnableSharding {
   662  		return r.getShardedRules(ctx, userID)
   663  	}
   664  
   665  	return r.getLocalRules(userID)
   666  }
   667  
   668  func (r *Ruler) getLocalRules(userID string) ([]*GroupStateDesc, error) {
   669  	groups := r.manager.GetRules(userID)
   670  
   671  	groupDescs := make([]*GroupStateDesc, 0, len(groups))
   672  	prefix := filepath.Join(r.cfg.RulePath, userID) + "/"
   673  
   674  	for _, group := range groups {
   675  		interval := group.Interval()
   676  
   677  		// The mapped filename is url path escaped encoded to make handling `/` characters easier
   678  		decodedNamespace, err := url.PathUnescape(strings.TrimPrefix(group.File(), prefix))
   679  		if err != nil {
   680  			return nil, errors.Wrap(err, "unable to decode rule filename")
   681  		}
   682  
   683  		groupDesc := &GroupStateDesc{
   684  			Group: &rulespb.RuleGroupDesc{
   685  				Name:      group.Name(),
   686  				Namespace: decodedNamespace,
   687  				Interval:  interval,
   688  				User:      userID,
   689  			},
   690  
   691  			EvaluationTimestamp: group.GetLastEvaluation(),
   692  			EvaluationDuration:  group.GetEvaluationTime(),
   693  		}
   694  		for _, r := range group.Rules() {
   695  			lastError := ""
   696  			if r.LastError() != nil {
   697  				lastError = r.LastError().Error()
   698  			}
   699  
   700  			var ruleDesc *RuleStateDesc
   701  			switch rule := r.(type) {
   702  			case *promRules.AlertingRule:
   703  				rule.ActiveAlerts()
   704  				alerts := []*AlertStateDesc{}
   705  				for _, a := range rule.ActiveAlerts() {
   706  					alerts = append(alerts, &AlertStateDesc{
   707  						State:       a.State.String(),
   708  						Labels:      logproto.FromLabelsToLabelAdapters(a.Labels),
   709  						Annotations: logproto.FromLabelsToLabelAdapters(a.Annotations),
   710  						Value:       a.Value,
   711  						ActiveAt:    a.ActiveAt,
   712  						FiredAt:     a.FiredAt,
   713  						ResolvedAt:  a.ResolvedAt,
   714  						LastSentAt:  a.LastSentAt,
   715  						ValidUntil:  a.ValidUntil,
   716  					})
   717  				}
   718  				ruleDesc = &RuleStateDesc{
   719  					Rule: &rulespb.RuleDesc{
   720  						Expr:        rule.Query().String(),
   721  						Alert:       rule.Name(),
   722  						For:         rule.HoldDuration(),
   723  						Labels:      logproto.FromLabelsToLabelAdapters(rule.Labels()),
   724  						Annotations: logproto.FromLabelsToLabelAdapters(rule.Annotations()),
   725  					},
   726  					State:               rule.State().String(),
   727  					Health:              string(rule.Health()),
   728  					LastError:           lastError,
   729  					Alerts:              alerts,
   730  					EvaluationTimestamp: rule.GetEvaluationTimestamp(),
   731  					EvaluationDuration:  rule.GetEvaluationDuration(),
   732  				}
   733  			case *promRules.RecordingRule:
   734  				ruleDesc = &RuleStateDesc{
   735  					Rule: &rulespb.RuleDesc{
   736  						Record: rule.Name(),
   737  						Expr:   rule.Query().String(),
   738  						Labels: logproto.FromLabelsToLabelAdapters(rule.Labels()),
   739  					},
   740  					Health:              string(rule.Health()),
   741  					LastError:           lastError,
   742  					EvaluationTimestamp: rule.GetEvaluationTimestamp(),
   743  					EvaluationDuration:  rule.GetEvaluationDuration(),
   744  				}
   745  			default:
   746  				return nil, errors.Errorf("failed to assert type of rule '%v'", rule.Name())
   747  			}
   748  			groupDesc.ActiveRules = append(groupDesc.ActiveRules, ruleDesc)
   749  		}
   750  		groupDescs = append(groupDescs, groupDesc)
   751  	}
   752  	return groupDescs, nil
   753  }
   754  
   755  func (r *Ruler) getShardedRules(ctx context.Context, userID string) ([]*GroupStateDesc, error) {
   756  	ring := ring.ReadRing(r.ring)
   757  
   758  	if shardSize := r.limits.RulerTenantShardSize(userID); shardSize > 0 && r.cfg.ShardingStrategy == util.ShardingStrategyShuffle {
   759  		ring = r.ring.ShuffleShard(userID, shardSize)
   760  	}
   761  
   762  	rulers, err := ring.GetReplicationSetForOperation(RingOp)
   763  	if err != nil {
   764  		return nil, err
   765  	}
   766  
   767  	ctx, err = user.InjectIntoGRPCRequest(ctx)
   768  	if err != nil {
   769  		return nil, fmt.Errorf("unable to inject user ID into grpc request, %v", err)
   770  	}
   771  
   772  	var (
   773  		mergedMx sync.Mutex
   774  		merged   []*GroupStateDesc
   775  	)
   776  
   777  	// Concurrently fetch rules from all rulers. Since rules are not replicated,
   778  	// we need all requests to succeed.
   779  	addresses := rulers.GetAddresses()
   780  	err = concurrency.ForEachJob(ctx, len(addresses), len(addresses), func(ctx context.Context, idx int) error {
   781  		addr := addresses[idx]
   782  
   783  		rulerClient, err := r.clientsPool.GetClientFor(addr)
   784  		if err != nil {
   785  			return errors.Wrapf(err, "unable to get client for ruler %s", addr)
   786  		}
   787  
   788  		newGrps, err := rulerClient.Rules(ctx, &RulesRequest{})
   789  		if err != nil {
   790  			return errors.Wrapf(err, "unable to retrieve rules from ruler %s", addr)
   791  		}
   792  
   793  		mergedMx.Lock()
   794  		merged = append(merged, newGrps.Groups...)
   795  		mergedMx.Unlock()
   796  
   797  		return nil
   798  	})
   799  
   800  	return merged, err
   801  }
   802  
   803  // Rules implements the rules service
   804  func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, error) {
   805  	userID, err := tenant.TenantID(ctx)
   806  	if err != nil {
   807  		return nil, fmt.Errorf("no user id found in context")
   808  	}
   809  
   810  	groupDescs, err := r.getLocalRules(userID)
   811  	if err != nil {
   812  		return nil, err
   813  	}
   814  
   815  	return &RulesResponse{Groups: groupDescs}, nil
   816  }
   817  
   818  // AssertMaxRuleGroups limit has not been reached compared to the current
   819  // number of total rule groups in input and returns an error if so.
   820  func (r *Ruler) AssertMaxRuleGroups(userID string, rg int) error {
   821  	limit := r.limits.RulerMaxRuleGroupsPerTenant(userID)
   822  
   823  	if limit <= 0 {
   824  		return nil
   825  	}
   826  
   827  	if rg <= limit {
   828  		return nil
   829  	}
   830  
   831  	return fmt.Errorf(errMaxRuleGroupsPerUserLimitExceeded, limit, rg)
   832  }
   833  
   834  // AssertMaxRulesPerRuleGroup limit has not been reached compared to the current
   835  // number of rules in a rule group in input and returns an error if so.
   836  func (r *Ruler) AssertMaxRulesPerRuleGroup(userID string, rules int) error {
   837  	limit := r.limits.RulerMaxRulesPerRuleGroup(userID)
   838  
   839  	if limit <= 0 {
   840  		return nil
   841  	}
   842  
   843  	if rules <= limit {
   844  		return nil
   845  	}
   846  	return fmt.Errorf(errMaxRulesPerRuleGroupPerUserLimitExceeded, limit, rules)
   847  }
   848  
   849  func (r *Ruler) DeleteTenantConfiguration(w http.ResponseWriter, req *http.Request) {
   850  	logger := util_log.WithContext(req.Context(), r.logger)
   851  
   852  	userID, err := tenant.TenantID(req.Context())
   853  	if err != nil {
   854  		// When Cortex is running, it uses Auth Middleware for checking X-Scope-OrgID and injecting tenant into context.
   855  		// Auth Middleware sends http.StatusUnauthorized if X-Scope-OrgID is missing, so we do too here, for consistency.
   856  		http.Error(w, err.Error(), http.StatusUnauthorized)
   857  		return
   858  	}
   859  
   860  	err = r.store.DeleteNamespace(req.Context(), userID, "") // Empty namespace = delete all rule groups.
   861  	if err != nil && !errors.Is(err, rulestore.ErrGroupNamespaceNotFound) {
   862  		respondError(logger, w, err.Error())
   863  		return
   864  	}
   865  
   866  	level.Info(logger).Log("msg", "deleted all tenant rule groups", "user", userID)
   867  	w.WriteHeader(http.StatusOK)
   868  }
   869  
   870  func (r *Ruler) ListAllRules(w http.ResponseWriter, req *http.Request) {
   871  	logger := util_log.WithContext(req.Context(), r.logger)
   872  
   873  	userIDs, err := r.store.ListAllUsers(req.Context())
   874  	if err != nil {
   875  		level.Error(logger).Log("msg", errListAllUser, "err", err)
   876  		http.Error(w, fmt.Sprintf("%s: %s", errListAllUser, err.Error()), http.StatusInternalServerError)
   877  		return
   878  	}
   879  
   880  	done := make(chan struct{})
   881  	iter := make(chan interface{})
   882  
   883  	go func() {
   884  		util.StreamWriteYAMLResponse(w, iter, logger)
   885  		close(done)
   886  	}()
   887  
   888  	err = concurrency.ForEachUser(req.Context(), userIDs, fetchRulesConcurrency, func(ctx context.Context, userID string) error {
   889  		rg, err := r.store.ListRuleGroupsForUserAndNamespace(ctx, userID, "")
   890  		if err != nil {
   891  			return errors.Wrapf(err, "failed to fetch ruler config for user %s", userID)
   892  		}
   893  		userRules := map[string]rulespb.RuleGroupList{userID: rg}
   894  		if err := r.store.LoadRuleGroups(ctx, userRules); err != nil {
   895  			return errors.Wrapf(err, "failed to load ruler config for user %s", userID)
   896  		}
   897  		data := map[string]map[string][]rulefmt.RuleGroup{userID: userRules[userID].Formatted()}
   898  
   899  		select {
   900  		case iter <- data:
   901  		case <-done: // stop early, if sending response has already finished
   902  		}
   903  
   904  		return nil
   905  	})
   906  	if err != nil {
   907  		level.Error(logger).Log("msg", "failed to list all ruler configs", "err", err)
   908  	}
   909  	close(iter)
   910  	<-done
   911  }