github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ruler/base/ruler.go (about) 1 package base 2 3 import ( 4 "context" 5 "flag" 6 "fmt" 7 "hash/fnv" 8 "net/http" 9 "net/url" 10 "path/filepath" 11 "strings" 12 "sync" 13 "time" 14 15 "github.com/go-kit/log" 16 "github.com/go-kit/log/level" 17 "github.com/grafana/dskit/concurrency" 18 "github.com/grafana/dskit/flagext" 19 "github.com/grafana/dskit/grpcclient" 20 "github.com/grafana/dskit/kv" 21 "github.com/grafana/dskit/ring" 22 "github.com/grafana/dskit/services" 23 "github.com/pkg/errors" 24 "github.com/prometheus/client_golang/prometheus" 25 "github.com/prometheus/client_golang/prometheus/promauto" 26 "github.com/prometheus/prometheus/model/labels" 27 "github.com/prometheus/prometheus/model/relabel" 28 "github.com/prometheus/prometheus/model/rulefmt" 29 "github.com/prometheus/prometheus/notifier" 30 promRules "github.com/prometheus/prometheus/rules" 31 "github.com/prometheus/prometheus/util/strutil" 32 "github.com/weaveworks/common/user" 33 "golang.org/x/sync/errgroup" 34 35 "github.com/grafana/dskit/tenant" 36 37 "github.com/grafana/loki/pkg/logproto" 38 "github.com/grafana/loki/pkg/ruler/rulespb" 39 "github.com/grafana/loki/pkg/ruler/rulestore" 40 "github.com/grafana/loki/pkg/util" 41 util_log "github.com/grafana/loki/pkg/util/log" 42 "github.com/grafana/loki/pkg/util/validation" 43 ) 44 45 var ( 46 supportedShardingStrategies = []string{util.ShardingStrategyDefault, util.ShardingStrategyShuffle} 47 48 // Validation errors. 49 errInvalidShardingStrategy = errors.New("invalid sharding strategy") 50 errInvalidTenantShardSize = errors.New("invalid tenant shard size, the value must be greater than 0") 51 ) 52 53 const ( 54 // ringKey is the key under which we store the rulers ring in the KVStore. 55 ringKey = "rulers" 56 57 // Number of concurrent group list and group loads operations. 58 loadRulesConcurrency = 10 59 fetchRulesConcurrency = 16 60 61 rulerSyncReasonInitial = "initial" 62 rulerSyncReasonPeriodic = "periodic" 63 rulerSyncReasonRingChange = "ring-change" 64 65 // Limit errors 66 errMaxRuleGroupsPerUserLimitExceeded = "per-user rule groups limit (limit: %d actual: %d) exceeded" 67 errMaxRulesPerRuleGroupPerUserLimitExceeded = "per-user rules per rule group limit (limit: %d actual: %d) exceeded" 68 69 // errors 70 errListAllUser = "unable to list the ruler users" 71 ) 72 73 // Config is the configuration for the recording rules server. 74 type Config struct { 75 // This is used for template expansion in alerts; must be a valid URL. 76 ExternalURL flagext.URLValue `yaml:"external_url"` 77 // Labels to add to all alerts 78 ExternalLabels labels.Labels `yaml:"external_labels,omitempty"` 79 // GRPC Client configuration. 80 ClientTLSConfig grpcclient.Config `yaml:"ruler_client"` 81 // How frequently to evaluate rules by default. 82 EvaluationInterval time.Duration `yaml:"evaluation_interval"` 83 // How frequently to poll for updated rules. 84 PollInterval time.Duration `yaml:"poll_interval"` 85 // Rule Storage and Polling configuration. 86 StoreConfig RuleStoreConfig `yaml:"storage" doc:"description=Deprecated. Use -ruler-storage.* CLI flags and their respective YAML config options instead."` 87 // Path to store rule files for prom manager. 88 RulePath string `yaml:"rule_path"` 89 90 // URL of the Alertmanager to send notifications to. 91 AlertmanagerURL string `yaml:"alertmanager_url"` 92 // Whether to use DNS SRV records to discover Alertmanager. 93 AlertmanagerDiscovery bool `yaml:"enable_alertmanager_discovery"` 94 // How long to wait between refreshing the list of Alertmanager based on DNS service discovery. 95 AlertmanagerRefreshInterval time.Duration `yaml:"alertmanager_refresh_interval"` 96 // Enables the ruler notifier to use the Alertmananger V2 API. 97 AlertmanangerEnableV2API bool `yaml:"enable_alertmanager_v2"` 98 // Configuration for alert relabeling. 99 AlertRelabelConfigs []*relabel.Config `yaml:"alert_relabel_configs,omitempty"` 100 // Capacity of the queue for notifications to be sent to the Alertmanager. 101 NotificationQueueCapacity int `yaml:"notification_queue_capacity"` 102 // HTTP timeout duration when sending notifications to the Alertmanager. 103 NotificationTimeout time.Duration `yaml:"notification_timeout"` 104 // Client configs for interacting with the Alertmanager 105 Notifier NotifierConfig `yaml:"alertmanager_client"` 106 107 // Max time to tolerate outage for restoring "for" state of alert. 108 OutageTolerance time.Duration `yaml:"for_outage_tolerance"` 109 // Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period. 110 ForGracePeriod time.Duration `yaml:"for_grace_period"` 111 // Minimum amount of time to wait before resending an alert to Alertmanager. 112 ResendDelay time.Duration `yaml:"resend_delay"` 113 114 // Enable sharding rule groups. 115 EnableSharding bool `yaml:"enable_sharding"` 116 ShardingStrategy string `yaml:"sharding_strategy"` 117 SearchPendingFor time.Duration `yaml:"search_pending_for"` 118 Ring RingConfig `yaml:"ring"` 119 FlushCheckPeriod time.Duration `yaml:"flush_period"` 120 121 EnableAPI bool `yaml:"enable_api"` 122 123 EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"` 124 DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"` 125 126 RingCheckPeriod time.Duration `yaml:"-"` 127 128 EnableQueryStats bool `yaml:"query_stats_enabled"` 129 DisableRuleGroupLabel bool `yaml:"disable_rule_group_label"` 130 } 131 132 // Validate config and returns error on failure 133 func (cfg *Config) Validate(limits validation.Limits, log log.Logger) error { 134 if !util.StringsContain(supportedShardingStrategies, cfg.ShardingStrategy) { 135 return errInvalidShardingStrategy 136 } 137 138 if cfg.ShardingStrategy == util.ShardingStrategyShuffle && limits.RulerTenantShardSize <= 0 { 139 return errInvalidTenantShardSize 140 } 141 142 if err := cfg.StoreConfig.Validate(); err != nil { 143 return errors.Wrap(err, "invalid storage config") 144 } 145 if err := cfg.ClientTLSConfig.Validate(log); err != nil { 146 return errors.Wrap(err, "invalid ruler gRPC client config") 147 } 148 return nil 149 } 150 151 // RegisterFlags adds the flags required to config this to the given FlagSet 152 func (cfg *Config) RegisterFlags(f *flag.FlagSet) { 153 cfg.ClientTLSConfig.RegisterFlagsWithPrefix("ruler.client", f) 154 cfg.StoreConfig.RegisterFlags(f) 155 cfg.Ring.RegisterFlags(f) 156 cfg.Notifier.RegisterFlags(f) 157 158 // Deprecated Flags that will be maintained to avoid user disruption 159 160 //lint:ignore faillint Need to pass the global logger like this for warning on deprecated methods 161 flagext.DeprecatedFlag(f, "ruler.client-timeout", "This flag has been renamed to ruler.configs.client-timeout", util_log.Logger) 162 //lint:ignore faillint Need to pass the global logger like this for warning on deprecated methods 163 flagext.DeprecatedFlag(f, "ruler.group-timeout", "This flag is no longer functional.", util_log.Logger) 164 //lint:ignore faillint Need to pass the global logger like this for warning on deprecated methods 165 flagext.DeprecatedFlag(f, "ruler.num-workers", "This flag is no longer functional. For increased concurrency horizontal sharding is recommended", util_log.Logger) 166 167 cfg.ExternalURL.URL, _ = url.Parse("") // Must be non-nil 168 f.Var(&cfg.ExternalURL, "ruler.external.url", "URL of alerts return path.") 169 f.DurationVar(&cfg.EvaluationInterval, "ruler.evaluation-interval", 1*time.Minute, "How frequently to evaluate rules") 170 f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes") 171 172 f.StringVar(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "", "Comma-separated list of URL(s) of the Alertmanager(s) to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. Multiple Alertmanagers in HA per group can be supported by using DNS resolution via -ruler.alertmanager-discovery.") 173 f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover Alertmanager hosts.") 174 f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing DNS resolutions of Alertmanager hosts.") 175 f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to Alertmanager will utilize the V2 API.") 176 f.IntVar(&cfg.NotificationQueueCapacity, "ruler.notification-queue-capacity", 10000, "Capacity of the queue for notifications to be sent to the Alertmanager.") 177 f.DurationVar(&cfg.NotificationTimeout, "ruler.notification-timeout", 10*time.Second, "HTTP timeout duration when sending notifications to the Alertmanager.") 178 179 f.DurationVar(&cfg.SearchPendingFor, "ruler.search-pending-for", 5*time.Minute, "Time to spend searching for a pending ruler when shutting down.") 180 f.BoolVar(&cfg.EnableSharding, "ruler.enable-sharding", false, "Distribute rule evaluation using ring backend") 181 f.StringVar(&cfg.ShardingStrategy, "ruler.sharding-strategy", util.ShardingStrategyDefault, fmt.Sprintf("The sharding strategy to use. Supported values are: %s.", strings.Join(supportedShardingStrategies, ", "))) 182 f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.") 183 f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers") 184 f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api") 185 f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`) 186 f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`) 187 f.DurationVar(&cfg.ResendDelay, "ruler.resend-delay", time.Minute, `Minimum amount of time to wait before resending an alert to Alertmanager.`) 188 189 f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.") 190 f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.") 191 192 f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.") 193 f.BoolVar(&cfg.DisableRuleGroupLabel, "ruler.disable-rule-group-label", false, "Disable the rule_group label on exported metrics") 194 195 cfg.RingCheckPeriod = 5 * time.Second 196 } 197 198 // MultiTenantManager is the interface of interaction with a Manager that is tenant aware. 199 type MultiTenantManager interface { 200 // SyncRuleGroups is used to sync the Manager with rules from the RuleStore. 201 // If existing user is missing in the ruleGroups map, its ruler manager will be stopped. 202 SyncRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList) 203 // GetRules fetches rules for a particular tenant (userID). 204 GetRules(userID string) []*promRules.Group 205 // Stop stops all Manager components. 206 Stop() 207 // ValidateRuleGroup validates a rulegroup 208 ValidateRuleGroup(rulefmt.RuleGroup) []error 209 } 210 211 // Ruler evaluates rules. 212 // +---------------------------------------------------------------+ 213 // | | 214 // | Query +-------------+ | 215 // | +------------------> | | 216 // | | | Store | | 217 // | | +----------------+ | | 218 // | | | Rules +-------------+ | 219 // | | | | 220 // | | | | 221 // | | | | 222 // | +----+-v----+ Filter +------------+ | 223 // | | +-----------> | | 224 // | | Ruler | | Ring | | 225 // | | <-----------+ | | 226 // | +-------+---+ Rules +------------+ | 227 // | | | 228 // | | | 229 // | | | 230 // | | Load +-----------------+ | 231 // | +--------------> | | 232 // | | Manager | | 233 // | | | | 234 // | +-----------------+ | 235 // | | 236 // +---------------------------------------------------------------+ 237 type Ruler struct { 238 services.Service 239 240 cfg Config 241 lifecycler *ring.BasicLifecycler 242 ring *ring.Ring 243 store rulestore.RuleStore 244 manager MultiTenantManager 245 limits RulesLimits 246 247 subservices *services.Manager 248 subservicesWatcher *services.FailureWatcher 249 250 // Pool of clients used to connect to other ruler replicas. 251 clientsPool ClientsPool 252 253 ringCheckErrors prometheus.Counter 254 rulerSync *prometheus.CounterVec 255 256 allowedTenants *util.AllowedTenants 257 258 registry prometheus.Registerer 259 logger log.Logger 260 } 261 262 // NewRuler creates a new ruler from a distributor and chunk store. 263 func NewRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer, logger log.Logger, ruleStore rulestore.RuleStore, limits RulesLimits) (*Ruler, error) { 264 return newRuler(cfg, manager, reg, logger, ruleStore, limits, newRulerClientPool(cfg.ClientTLSConfig, logger, reg)) 265 } 266 267 func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer, logger log.Logger, ruleStore rulestore.RuleStore, limits RulesLimits, clientPool ClientsPool) (*Ruler, error) { 268 ruler := &Ruler{ 269 cfg: cfg, 270 store: ruleStore, 271 manager: manager, 272 registry: reg, 273 logger: logger, 274 limits: limits, 275 clientsPool: clientPool, 276 allowedTenants: util.NewAllowedTenants(cfg.EnabledTenants, cfg.DisabledTenants), 277 278 ringCheckErrors: promauto.With(reg).NewCounter(prometheus.CounterOpts{ 279 Name: "cortex_ruler_ring_check_errors_total", 280 Help: "Number of errors that have occurred when checking the ring for ownership", 281 }), 282 283 rulerSync: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 284 Name: "cortex_ruler_sync_rules_total", 285 Help: "Total number of times the ruler sync operation triggered.", 286 }, []string{"reason"}), 287 } 288 289 if len(cfg.EnabledTenants) > 0 { 290 level.Info(ruler.logger).Log("msg", "ruler using enabled users", "enabled", strings.Join(cfg.EnabledTenants, ", ")) 291 } 292 if len(cfg.DisabledTenants) > 0 { 293 level.Info(ruler.logger).Log("msg", "ruler using disabled users", "disabled", strings.Join(cfg.DisabledTenants, ", ")) 294 } 295 296 if cfg.EnableSharding { 297 ringStore, err := kv.NewClient( 298 cfg.Ring.KVStore, 299 ring.GetCodec(), 300 kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("cortex_", reg), "ruler"), 301 logger, 302 ) 303 if err != nil { 304 return nil, errors.Wrap(err, "create KV store client") 305 } 306 307 if err = enableSharding(ruler, ringStore); err != nil { 308 return nil, errors.Wrap(err, "setup ruler sharding ring") 309 } 310 } 311 312 ruler.Service = services.NewBasicService(ruler.starting, ruler.run, ruler.stopping) 313 return ruler, nil 314 } 315 316 func enableSharding(r *Ruler, ringStore kv.Client) error { 317 lifecyclerCfg, err := r.cfg.Ring.ToLifecyclerConfig(r.logger) 318 if err != nil { 319 return errors.Wrap(err, "failed to initialize ruler's lifecycler config") 320 } 321 322 // Define lifecycler delegates in reverse order (last to be called defined first because they're 323 // chained via "next delegate"). 324 delegate := ring.BasicLifecyclerDelegate(r) 325 delegate = ring.NewLeaveOnStoppingDelegate(delegate, r.logger) 326 delegate = ring.NewAutoForgetDelegate(r.cfg.Ring.HeartbeatTimeout*ringAutoForgetUnhealthyPeriods, delegate, r.logger) 327 328 rulerRingName := "ruler" 329 r.lifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, rulerRingName, ringKey, ringStore, delegate, r.logger, prometheus.WrapRegistererWithPrefix("cortex_", r.registry)) 330 if err != nil { 331 return errors.Wrap(err, "failed to initialize ruler's lifecycler") 332 } 333 334 r.ring, err = ring.NewWithStoreClientAndStrategy(r.cfg.Ring.ToRingConfig(), rulerRingName, ringKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", r.registry), r.logger) 335 if err != nil { 336 return errors.Wrap(err, "failed to initialize ruler's ring") 337 } 338 339 return nil 340 } 341 342 func (r *Ruler) starting(ctx context.Context) error { 343 // If sharding is enabled, start the used subservices. 344 if r.cfg.EnableSharding { 345 var err error 346 347 if r.subservices, err = services.NewManager(r.lifecycler, r.ring, r.clientsPool); err != nil { 348 return errors.Wrap(err, "unable to start ruler subservices") 349 } 350 351 r.subservicesWatcher = services.NewFailureWatcher() 352 r.subservicesWatcher.WatchManager(r.subservices) 353 354 if err = services.StartManagerAndAwaitHealthy(ctx, r.subservices); err != nil { 355 return errors.Wrap(err, "unable to start ruler subservices") 356 } 357 } 358 359 // TODO: ideally, ruler would wait until its queryable is finished starting. 360 return nil 361 } 362 363 // Stop stops the Ruler. 364 // Each function of the ruler is terminated before leaving the ring 365 func (r *Ruler) stopping(_ error) error { 366 r.manager.Stop() 367 368 if r.subservices != nil { 369 _ = services.StopManagerAndAwaitStopped(context.Background(), r.subservices) 370 } 371 return nil 372 } 373 374 type sender interface { 375 Send(alerts ...*notifier.Alert) 376 } 377 378 // SendAlerts implements a rules.NotifyFunc for a Notifier. 379 // It filters any non-firing alerts from the input. 380 // 381 // Copied from Prometheus's main.go. 382 func SendAlerts(n sender, externalURL string) promRules.NotifyFunc { 383 return func(ctx context.Context, expr string, alerts ...*promRules.Alert) { 384 var res []*notifier.Alert 385 386 for _, alert := range alerts { 387 a := ¬ifier.Alert{ 388 StartsAt: alert.FiredAt, 389 Labels: alert.Labels, 390 Annotations: alert.Annotations, 391 GeneratorURL: externalURL + strutil.TableLinkForExpression(expr), 392 } 393 if !alert.ResolvedAt.IsZero() { 394 a.EndsAt = alert.ResolvedAt 395 } else { 396 a.EndsAt = alert.ValidUntil 397 } 398 res = append(res, a) 399 } 400 401 if len(alerts) > 0 { 402 n.Send(res...) 403 } 404 } 405 } 406 407 var sep = []byte("/") 408 409 func tokenForGroup(g *rulespb.RuleGroupDesc) uint32 { 410 ringHasher := fnv.New32a() 411 412 // Hasher never returns err. 413 _, _ = ringHasher.Write([]byte(g.User)) 414 _, _ = ringHasher.Write(sep) 415 _, _ = ringHasher.Write([]byte(g.Namespace)) 416 _, _ = ringHasher.Write(sep) 417 _, _ = ringHasher.Write([]byte(g.Name)) 418 419 return ringHasher.Sum32() 420 } 421 422 func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, instanceAddr string) (bool, error) { 423 hash := tokenForGroup(g) 424 425 rlrs, err := r.Get(hash, RingOp, nil, nil, nil) 426 if err != nil { 427 return false, errors.Wrap(err, "error reading ring to verify rule group ownership") 428 } 429 430 return rlrs.Instances[0].Addr == instanceAddr, nil 431 } 432 433 func (r *Ruler) ServeHTTP(w http.ResponseWriter, req *http.Request) { 434 if r.cfg.EnableSharding { 435 r.ring.ServeHTTP(w, req) 436 } else { 437 unshardedPage := ` 438 <!DOCTYPE html> 439 <html> 440 <head> 441 <meta charset="UTF-8"> 442 <title>Cortex Ruler Status</title> 443 </head> 444 <body> 445 <h1>Cortex Ruler Status</h1> 446 <p>Ruler running with shards disabled</p> 447 </body> 448 </html>` 449 util.WriteHTMLResponse(w, unshardedPage) 450 } 451 } 452 453 func (r *Ruler) run(ctx context.Context) error { 454 level.Info(r.logger).Log("msg", "ruler up and running") 455 456 tick := time.NewTicker(r.cfg.PollInterval) 457 defer tick.Stop() 458 459 var ringTickerChan <-chan time.Time 460 var ringLastState ring.ReplicationSet 461 462 if r.cfg.EnableSharding { 463 ringLastState, _ = r.ring.GetAllHealthy(RingOp) 464 ringTicker := time.NewTicker(util.DurationWithJitter(r.cfg.RingCheckPeriod, 0.2)) 465 defer ringTicker.Stop() 466 ringTickerChan = ringTicker.C 467 } 468 469 r.syncRules(ctx, rulerSyncReasonInitial) 470 for { 471 select { 472 case <-ctx.Done(): 473 return nil 474 case <-tick.C: 475 r.syncRules(ctx, rulerSyncReasonPeriodic) 476 case <-ringTickerChan: 477 // We ignore the error because in case of error it will return an empty 478 // replication set which we use to compare with the previous state. 479 currRingState, _ := r.ring.GetAllHealthy(RingOp) 480 481 if ring.HasReplicationSetChanged(ringLastState, currRingState) { 482 ringLastState = currRingState 483 r.syncRules(ctx, rulerSyncReasonRingChange) 484 } 485 case err := <-r.subservicesWatcher.Chan(): 486 return errors.Wrap(err, "ruler subservice failed") 487 } 488 } 489 } 490 491 func (r *Ruler) syncRules(ctx context.Context, reason string) { 492 level.Debug(r.logger).Log("msg", "syncing rules", "reason", reason) 493 r.rulerSync.WithLabelValues(reason).Inc() 494 495 configs, err := r.listRules(ctx) 496 if err != nil { 497 level.Error(r.logger).Log("msg", "unable to list rules", "err", err) 498 return 499 } 500 501 err = r.store.LoadRuleGroups(ctx, configs) 502 if err != nil { 503 level.Error(r.logger).Log("msg", "unable to load rules owned by this ruler", "err", err) 504 return 505 } 506 507 // This will also delete local group files for users that are no longer in 'configs' map. 508 r.manager.SyncRuleGroups(ctx, configs) 509 } 510 511 func (r *Ruler) listRules(ctx context.Context) (result map[string]rulespb.RuleGroupList, err error) { 512 switch { 513 case !r.cfg.EnableSharding: 514 result, err = r.listRulesNoSharding(ctx) 515 516 case r.cfg.ShardingStrategy == util.ShardingStrategyDefault: 517 result, err = r.listRulesShardingDefault(ctx) 518 519 case r.cfg.ShardingStrategy == util.ShardingStrategyShuffle: 520 result, err = r.listRulesShuffleSharding(ctx) 521 522 default: 523 return nil, errors.New("invalid sharding configuration") 524 } 525 526 if err != nil { 527 return 528 } 529 530 for userID := range result { 531 if !r.allowedTenants.IsAllowed(userID) { 532 level.Debug(r.logger).Log("msg", "ignoring rule groups for user, not allowed", "user", userID) 533 delete(result, userID) 534 } 535 } 536 return 537 } 538 539 func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.RuleGroupList, error) { 540 return r.store.ListAllRuleGroups(ctx) 541 } 542 543 func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulespb.RuleGroupList, error) { 544 configs, err := r.store.ListAllRuleGroups(ctx) 545 if err != nil { 546 return nil, err 547 } 548 549 filteredConfigs := make(map[string]rulespb.RuleGroupList) 550 for userID, groups := range configs { 551 filtered := filterRuleGroups(userID, groups, r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) 552 if len(filtered) > 0 { 553 filteredConfigs[userID] = filtered 554 } 555 } 556 return filteredConfigs, nil 557 } 558 559 func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulespb.RuleGroupList, error) { 560 users, err := r.store.ListAllUsers(ctx) 561 if err != nil { 562 return nil, errors.Wrap(err, "unable to list users of ruler") 563 } 564 565 // Only users in userRings will be used in the to load the rules. 566 userRings := map[string]ring.ReadRing{} 567 for _, u := range users { 568 if shardSize := r.limits.RulerTenantShardSize(u); shardSize > 0 { 569 subRing := r.ring.ShuffleShard(u, shardSize) 570 571 // Include the user only if it belongs to this ruler shard. 572 if subRing.HasInstance(r.lifecycler.GetInstanceID()) { 573 userRings[u] = subRing 574 } 575 } else { 576 // A shard size of 0 means shuffle sharding is disabled for this specific user. 577 // In that case we use the full ring so that rule groups will be sharded across all rulers. 578 userRings[u] = r.ring 579 } 580 } 581 582 if len(userRings) == 0 { 583 return nil, nil 584 } 585 586 userCh := make(chan string, len(userRings)) 587 for u := range userRings { 588 userCh <- u 589 } 590 close(userCh) 591 592 mu := sync.Mutex{} 593 result := map[string]rulespb.RuleGroupList{} 594 595 concurrency := loadRulesConcurrency 596 if len(userRings) < concurrency { 597 concurrency = len(userRings) 598 } 599 600 g, gctx := errgroup.WithContext(ctx) 601 for i := 0; i < concurrency; i++ { 602 g.Go(func() error { 603 for userID := range userCh { 604 groups, err := r.store.ListRuleGroupsForUserAndNamespace(gctx, userID, "") 605 if err != nil { 606 return errors.Wrapf(err, "failed to fetch rule groups for user %s", userID) 607 } 608 609 filtered := filterRuleGroups(userID, groups, userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors) 610 if len(filtered) == 0 { 611 continue 612 } 613 614 mu.Lock() 615 result[userID] = filtered 616 mu.Unlock() 617 } 618 return nil 619 }) 620 } 621 622 err = g.Wait() 623 return result, err 624 } 625 626 // filterRuleGroups returns map of rule groups that given instance "owns" based on supplied ring. 627 // This function only uses User, Namespace, and Name fields of individual RuleGroups. 628 // 629 // Reason why this function is not a method on Ruler is to make sure we don't accidentally use r.ring, 630 // but only ring passed as parameter. 631 func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc { 632 // Prune the rule group to only contain rules that this ruler is responsible for, based on ring. 633 var result []*rulespb.RuleGroupDesc 634 for _, g := range ruleGroups { 635 owned, err := instanceOwnsRuleGroup(ring, g, instanceAddr) 636 if err != nil { 637 ringCheckErrors.Inc() 638 level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err) 639 continue 640 } 641 642 if owned { 643 level.Debug(log).Log("msg", "rule group owned", "user", g.User, "namespace", g.Namespace, "name", g.Name) 644 result = append(result, g) 645 } else { 646 level.Debug(log).Log("msg", "rule group not owned, ignoring", "user", g.User, "namespace", g.Namespace, "name", g.Name) 647 } 648 } 649 650 return result 651 } 652 653 // GetRules retrieves the running rules from this ruler and all running rulers in the ring if 654 // sharding is enabled 655 func (r *Ruler) GetRules(ctx context.Context) ([]*GroupStateDesc, error) { 656 userID, err := tenant.TenantID(ctx) 657 if err != nil { 658 return nil, fmt.Errorf("no user id found in context") 659 } 660 661 if r.cfg.EnableSharding { 662 return r.getShardedRules(ctx, userID) 663 } 664 665 return r.getLocalRules(userID) 666 } 667 668 func (r *Ruler) getLocalRules(userID string) ([]*GroupStateDesc, error) { 669 groups := r.manager.GetRules(userID) 670 671 groupDescs := make([]*GroupStateDesc, 0, len(groups)) 672 prefix := filepath.Join(r.cfg.RulePath, userID) + "/" 673 674 for _, group := range groups { 675 interval := group.Interval() 676 677 // The mapped filename is url path escaped encoded to make handling `/` characters easier 678 decodedNamespace, err := url.PathUnescape(strings.TrimPrefix(group.File(), prefix)) 679 if err != nil { 680 return nil, errors.Wrap(err, "unable to decode rule filename") 681 } 682 683 groupDesc := &GroupStateDesc{ 684 Group: &rulespb.RuleGroupDesc{ 685 Name: group.Name(), 686 Namespace: decodedNamespace, 687 Interval: interval, 688 User: userID, 689 }, 690 691 EvaluationTimestamp: group.GetLastEvaluation(), 692 EvaluationDuration: group.GetEvaluationTime(), 693 } 694 for _, r := range group.Rules() { 695 lastError := "" 696 if r.LastError() != nil { 697 lastError = r.LastError().Error() 698 } 699 700 var ruleDesc *RuleStateDesc 701 switch rule := r.(type) { 702 case *promRules.AlertingRule: 703 rule.ActiveAlerts() 704 alerts := []*AlertStateDesc{} 705 for _, a := range rule.ActiveAlerts() { 706 alerts = append(alerts, &AlertStateDesc{ 707 State: a.State.String(), 708 Labels: logproto.FromLabelsToLabelAdapters(a.Labels), 709 Annotations: logproto.FromLabelsToLabelAdapters(a.Annotations), 710 Value: a.Value, 711 ActiveAt: a.ActiveAt, 712 FiredAt: a.FiredAt, 713 ResolvedAt: a.ResolvedAt, 714 LastSentAt: a.LastSentAt, 715 ValidUntil: a.ValidUntil, 716 }) 717 } 718 ruleDesc = &RuleStateDesc{ 719 Rule: &rulespb.RuleDesc{ 720 Expr: rule.Query().String(), 721 Alert: rule.Name(), 722 For: rule.HoldDuration(), 723 Labels: logproto.FromLabelsToLabelAdapters(rule.Labels()), 724 Annotations: logproto.FromLabelsToLabelAdapters(rule.Annotations()), 725 }, 726 State: rule.State().String(), 727 Health: string(rule.Health()), 728 LastError: lastError, 729 Alerts: alerts, 730 EvaluationTimestamp: rule.GetEvaluationTimestamp(), 731 EvaluationDuration: rule.GetEvaluationDuration(), 732 } 733 case *promRules.RecordingRule: 734 ruleDesc = &RuleStateDesc{ 735 Rule: &rulespb.RuleDesc{ 736 Record: rule.Name(), 737 Expr: rule.Query().String(), 738 Labels: logproto.FromLabelsToLabelAdapters(rule.Labels()), 739 }, 740 Health: string(rule.Health()), 741 LastError: lastError, 742 EvaluationTimestamp: rule.GetEvaluationTimestamp(), 743 EvaluationDuration: rule.GetEvaluationDuration(), 744 } 745 default: 746 return nil, errors.Errorf("failed to assert type of rule '%v'", rule.Name()) 747 } 748 groupDesc.ActiveRules = append(groupDesc.ActiveRules, ruleDesc) 749 } 750 groupDescs = append(groupDescs, groupDesc) 751 } 752 return groupDescs, nil 753 } 754 755 func (r *Ruler) getShardedRules(ctx context.Context, userID string) ([]*GroupStateDesc, error) { 756 ring := ring.ReadRing(r.ring) 757 758 if shardSize := r.limits.RulerTenantShardSize(userID); shardSize > 0 && r.cfg.ShardingStrategy == util.ShardingStrategyShuffle { 759 ring = r.ring.ShuffleShard(userID, shardSize) 760 } 761 762 rulers, err := ring.GetReplicationSetForOperation(RingOp) 763 if err != nil { 764 return nil, err 765 } 766 767 ctx, err = user.InjectIntoGRPCRequest(ctx) 768 if err != nil { 769 return nil, fmt.Errorf("unable to inject user ID into grpc request, %v", err) 770 } 771 772 var ( 773 mergedMx sync.Mutex 774 merged []*GroupStateDesc 775 ) 776 777 // Concurrently fetch rules from all rulers. Since rules are not replicated, 778 // we need all requests to succeed. 779 addresses := rulers.GetAddresses() 780 err = concurrency.ForEachJob(ctx, len(addresses), len(addresses), func(ctx context.Context, idx int) error { 781 addr := addresses[idx] 782 783 rulerClient, err := r.clientsPool.GetClientFor(addr) 784 if err != nil { 785 return errors.Wrapf(err, "unable to get client for ruler %s", addr) 786 } 787 788 newGrps, err := rulerClient.Rules(ctx, &RulesRequest{}) 789 if err != nil { 790 return errors.Wrapf(err, "unable to retrieve rules from ruler %s", addr) 791 } 792 793 mergedMx.Lock() 794 merged = append(merged, newGrps.Groups...) 795 mergedMx.Unlock() 796 797 return nil 798 }) 799 800 return merged, err 801 } 802 803 // Rules implements the rules service 804 func (r *Ruler) Rules(ctx context.Context, in *RulesRequest) (*RulesResponse, error) { 805 userID, err := tenant.TenantID(ctx) 806 if err != nil { 807 return nil, fmt.Errorf("no user id found in context") 808 } 809 810 groupDescs, err := r.getLocalRules(userID) 811 if err != nil { 812 return nil, err 813 } 814 815 return &RulesResponse{Groups: groupDescs}, nil 816 } 817 818 // AssertMaxRuleGroups limit has not been reached compared to the current 819 // number of total rule groups in input and returns an error if so. 820 func (r *Ruler) AssertMaxRuleGroups(userID string, rg int) error { 821 limit := r.limits.RulerMaxRuleGroupsPerTenant(userID) 822 823 if limit <= 0 { 824 return nil 825 } 826 827 if rg <= limit { 828 return nil 829 } 830 831 return fmt.Errorf(errMaxRuleGroupsPerUserLimitExceeded, limit, rg) 832 } 833 834 // AssertMaxRulesPerRuleGroup limit has not been reached compared to the current 835 // number of rules in a rule group in input and returns an error if so. 836 func (r *Ruler) AssertMaxRulesPerRuleGroup(userID string, rules int) error { 837 limit := r.limits.RulerMaxRulesPerRuleGroup(userID) 838 839 if limit <= 0 { 840 return nil 841 } 842 843 if rules <= limit { 844 return nil 845 } 846 return fmt.Errorf(errMaxRulesPerRuleGroupPerUserLimitExceeded, limit, rules) 847 } 848 849 func (r *Ruler) DeleteTenantConfiguration(w http.ResponseWriter, req *http.Request) { 850 logger := util_log.WithContext(req.Context(), r.logger) 851 852 userID, err := tenant.TenantID(req.Context()) 853 if err != nil { 854 // When Cortex is running, it uses Auth Middleware for checking X-Scope-OrgID and injecting tenant into context. 855 // Auth Middleware sends http.StatusUnauthorized if X-Scope-OrgID is missing, so we do too here, for consistency. 856 http.Error(w, err.Error(), http.StatusUnauthorized) 857 return 858 } 859 860 err = r.store.DeleteNamespace(req.Context(), userID, "") // Empty namespace = delete all rule groups. 861 if err != nil && !errors.Is(err, rulestore.ErrGroupNamespaceNotFound) { 862 respondError(logger, w, err.Error()) 863 return 864 } 865 866 level.Info(logger).Log("msg", "deleted all tenant rule groups", "user", userID) 867 w.WriteHeader(http.StatusOK) 868 } 869 870 func (r *Ruler) ListAllRules(w http.ResponseWriter, req *http.Request) { 871 logger := util_log.WithContext(req.Context(), r.logger) 872 873 userIDs, err := r.store.ListAllUsers(req.Context()) 874 if err != nil { 875 level.Error(logger).Log("msg", errListAllUser, "err", err) 876 http.Error(w, fmt.Sprintf("%s: %s", errListAllUser, err.Error()), http.StatusInternalServerError) 877 return 878 } 879 880 done := make(chan struct{}) 881 iter := make(chan interface{}) 882 883 go func() { 884 util.StreamWriteYAMLResponse(w, iter, logger) 885 close(done) 886 }() 887 888 err = concurrency.ForEachUser(req.Context(), userIDs, fetchRulesConcurrency, func(ctx context.Context, userID string) error { 889 rg, err := r.store.ListRuleGroupsForUserAndNamespace(ctx, userID, "") 890 if err != nil { 891 return errors.Wrapf(err, "failed to fetch ruler config for user %s", userID) 892 } 893 userRules := map[string]rulespb.RuleGroupList{userID: rg} 894 if err := r.store.LoadRuleGroups(ctx, userRules); err != nil { 895 return errors.Wrapf(err, "failed to load ruler config for user %s", userID) 896 } 897 data := map[string]map[string][]rulefmt.RuleGroup{userID: userRules[userID].Formatted()} 898 899 select { 900 case iter <- data: 901 case <-done: // stop early, if sending response has already finished 902 } 903 904 return nil 905 }) 906 if err != nil { 907 level.Error(logger).Log("msg", "failed to list all ruler configs", "err", err) 908 } 909 close(iter) 910 <-done 911 }