github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/alertmanager.go

package alertmanager

import (
	"context"
	"crypto/md5"
	"encoding/binary"
	"fmt"
	"net/http"
	"net/url"
	"path"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/flagext"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/alertmanager/api"
	"github.com/prometheus/alertmanager/cluster"
	"github.com/prometheus/alertmanager/cluster/clusterpb"
	"github.com/prometheus/alertmanager/config"
	"github.com/prometheus/alertmanager/dispatch"
	"github.com/prometheus/alertmanager/inhibit"
	"github.com/prometheus/alertmanager/nflog"
	"github.com/prometheus/alertmanager/notify"
	"github.com/prometheus/alertmanager/notify/email"
	"github.com/prometheus/alertmanager/notify/opsgenie"
	"github.com/prometheus/alertmanager/notify/pagerduty"
	"github.com/prometheus/alertmanager/notify/pushover"
	"github.com/prometheus/alertmanager/notify/slack"
	"github.com/prometheus/alertmanager/notify/sns"
	"github.com/prometheus/alertmanager/notify/victorops"
	"github.com/prometheus/alertmanager/notify/webhook"
	"github.com/prometheus/alertmanager/notify/wechat"
	"github.com/prometheus/alertmanager/provider/mem"
	"github.com/prometheus/alertmanager/silence"
	"github.com/prometheus/alertmanager/template"
	"github.com/prometheus/alertmanager/timeinterval"
	"github.com/prometheus/alertmanager/types"
	"github.com/prometheus/alertmanager/ui"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	commoncfg "github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	"github.com/prometheus/common/route"
	"golang.org/x/time/rate"

	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
	util_net "github.com/cortexproject/cortex/pkg/util/net"
)

const (
	// maintenancePeriod is used for periodic storing of silences and notifications to a local file.
	maintenancePeriod = 15 * time.Minute

	// Filenames used within the tenant directory.
	notificationLogSnapshot = "notifications"
	silencesSnapshot        = "silences"
	templatesDir            = "templates"
)

// Config configures an Alertmanager.
type Config struct {
	UserID      string
	Logger      log.Logger
	Peer        *cluster.Peer
	PeerTimeout time.Duration
	Retention   time.Duration
	ExternalURL *url.URL
	Limits      Limits

	// Tenant-specific local directory where AM can store its state (notifications, silences, templates).
	// When AM is stopped, the entire directory is removed.
	TenantDataDir string

	ShardingEnabled   bool
	ReplicationFactor int
	Replicator        Replicator
	Store             alertstore.AlertStore
	PersisterConfig   PersisterConfig
}

// An Alertmanager manages the alerts for one user.
type Alertmanager struct {
	cfg             *Config
	api             *api.API
	logger          log.Logger
	state           State
	persister       *statePersister
	nflog           *nflog.Log
	silences        *silence.Silences
	marker          types.Marker
	alerts          *mem.Alerts
	dispatcher      *dispatch.Dispatcher
	inhibitor       *inhibit.Inhibitor
	pipelineBuilder *notify.PipelineBuilder
	stop            chan struct{}
	wg              sync.WaitGroup
	mux             *http.ServeMux
	registry        *prometheus.Registry

	// Pipeline created during last ApplyConfig call. Used for testing only.
	lastPipeline notify.Stage

	// The Dispatcher is the only component we need to recreate when we call ApplyConfig.
	// Given its metrics don't have any variable labels, we need to re-use the same metrics.
	dispatcherMetrics *dispatch.DispatcherMetrics
	// This needs to be set to the hash of the config. All the hashes need to be the same
	// for deduping of alerts to work, hence we need this metric. See https://github.com/prometheus/alertmanager/issues/596
	// Further, in upstream AM, this metric is handled using the config coordinator, which we don't use,
	// hence we need to generate the metric ourselves.
	configHashMetric prometheus.Gauge

	rateLimitedNotifications *prometheus.CounterVec
}

var (
	webReload = make(chan chan error)
)

func init() {
	go func() {
		// Since this is not a "normal" Alertmanager which reads its config
		// from disk, we just accept and ignore web-based reload signals. Config
		// updates are only applied externally via ApplyConfig().
		for range webReload {
		}
	}()
}

// State helps with replication and synchronization of notifications and silences across several alertmanager replicas.
type State interface {
	AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel
	Position() int
	WaitReady(context.Context) error
}

// Replicator is used to exchange state with peers via the ring when sharding is enabled.
type Replicator interface {
	// ReplicateStateForUser writes the given partial state to the necessary replicas.
	ReplicateStateForUser(ctx context.Context, userID string, part *clusterpb.Part) error
	// GetPositionForUser returns the position of this replica relative to the other replicas.
	// The alertmanager replication protocol relies on this position to identify who should
	// notify about the alert first.
	GetPositionForUser(userID string) int
	// ReadFullStateForUser obtains the full state from other replicas in the cluster.
	ReadFullStateForUser(context.Context, string) ([]*clusterpb.FullState, error)
}

// New creates a new Alertmanager.
func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
	if cfg.TenantDataDir == "" {
		return nil, fmt.Errorf("directory for tenant-specific AlertManager is not configured")
	}

	am := &Alertmanager{
		cfg:    cfg,
		logger: log.With(cfg.Logger, "user", cfg.UserID),
		stop:   make(chan struct{}),
		configHashMetric: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "alertmanager_config_hash",
			Help: "Hash of the currently loaded alertmanager configuration.",
		}),

		rateLimitedNotifications: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "alertmanager_notification_rate_limited_total",
			Help: "Number of rate-limited notifications per integration.",
		}, []string{"integration"}), // "integration" is consistent with other alertmanager metrics.
	}

	am.registry = reg

	// We currently have 3 operational modes:
	// 1) Alertmanager clustering with upstream Gossip
	// 2) Alertmanager sharding and ring-based replication
	// 3) Alertmanager no replication
	// These are covered in order.
	if cfg.Peer != nil {
		level.Debug(am.logger).Log("msg", "starting tenant alertmanager with gossip-based replication")
		am.state = cfg.Peer
	} else if cfg.ShardingEnabled {
		level.Debug(am.logger).Log("msg", "starting tenant alertmanager with ring-based replication")
		state := newReplicatedStates(cfg.UserID, cfg.ReplicationFactor, cfg.Replicator, cfg.Store, am.logger, am.registry)
		am.state = state
		am.persister = newStatePersister(cfg.PersisterConfig, cfg.UserID, state, cfg.Store, am.logger, am.registry)
	} else {
		level.Debug(am.logger).Log("msg", "starting tenant alertmanager without replication")
		am.state = &NilPeer{}
	}

	am.wg.Add(1)
	var err error
	am.nflog, err = nflog.New(
		nflog.WithRetention(cfg.Retention),
		nflog.WithSnapshot(filepath.Join(cfg.TenantDataDir, notificationLogSnapshot)),
		nflog.WithMaintenance(maintenancePeriod, am.stop, am.wg.Done, nil),
		nflog.WithMetrics(am.registry),
		nflog.WithLogger(log.With(am.logger, "component", "nflog")),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create notification log: %v", err)
	}

	c := am.state.AddState("nfl:"+cfg.UserID, am.nflog, am.registry)
	am.nflog.SetBroadcast(c.Broadcast)

	am.marker = types.NewMarker(am.registry)

	silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot)
	am.silences, err = silence.New(silence.Options{
		SnapshotFile: silencesFile,
		Retention:    cfg.Retention,
		Logger:       log.With(am.logger, "component", "silences"),
		Metrics:      am.registry,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to create silences: %v", err)
	}

	c = am.state.AddState("sil:"+cfg.UserID, am.silences, am.registry)
	am.silences.SetBroadcast(c.Broadcast)

	// State replication needs to be started after the state keys are defined.
	if service, ok := am.state.(services.Service); ok {
		if err := service.StartAsync(context.Background()); err != nil {
			return nil, errors.Wrap(err, "failed to start ring-based replication service")
		}
	}

	if am.persister != nil {
		if err := am.persister.StartAsync(context.Background()); err != nil {
			return nil, errors.Wrap(err, "failed to start state persister service")
		}
	}

	am.pipelineBuilder = notify.NewPipelineBuilder(am.registry)

	am.wg.Add(1)
	go func() {
		am.silences.Maintenance(maintenancePeriod, silencesFile, am.stop, nil)
		am.wg.Done()
	}()

	var callback mem.AlertStoreCallback
	if am.cfg.Limits != nil {
		callback = newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, reg)
	}

	am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, callback, am.logger)
	if err != nil {
		return nil, fmt.Errorf("failed to create alerts: %v", err)
	}

	am.api, err = api.New(api.Options{
		Alerts:     am.alerts,
		Silences:   am.silences,
		StatusFunc: am.marker.Status,
		// Cortex should not expose cluster information back to its tenants.
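		// NilPeer (defined at the bottom of this file) reports an empty, always-ready
		// cluster, so the API's cluster status endpoints return nothing tenant-visible.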
		Peer:     &NilPeer{},
		Registry: am.registry,
		Logger:   log.With(am.logger, "component", "api"),
		GroupFunc: func(f1 func(*dispatch.Route) bool, f2 func(*types.Alert, time.Time) bool) (dispatch.AlertGroups, map[model.Fingerprint][]string) {
			return am.dispatcher.Groups(f1, f2)
		},
	})
	if err != nil {
		return nil, fmt.Errorf("failed to create api: %v", err)
	}

	router := route.New().WithPrefix(am.cfg.ExternalURL.Path)

	ui.Register(router, webReload, log.With(am.logger, "component", "ui"))
	am.mux = am.api.Register(router, am.cfg.ExternalURL.Path)

	// Override some extra paths registered in the router (e.g. /metrics which by default exposes prometheus.DefaultRegisterer).
	// The entire router is registered in the mux under the "/" path, so there is no conflict with overwriting specific paths.
	for _, p := range []string{"/metrics", "/-/reload", "/debug/"} {
		a := path.Join(am.cfg.ExternalURL.Path, p)
		// Preserve the trailing slash, as for the mux it means the entire subtree.
		if strings.HasSuffix(p, "/") {
			a = a + "/"
		}
		am.mux.Handle(a, http.NotFoundHandler())
	}

	am.dispatcherMetrics = dispatch.NewDispatcherMetrics(true, am.registry)

	// TODO: From this point onward, the alertmanager _might_ receive requests - we need to make sure we've settled and are ready.
	return am, nil
}

func (am *Alertmanager) WaitInitialStateSync(ctx context.Context) error {
	if service, ok := am.state.(services.Service); ok {
		if err := service.AwaitRunning(ctx); err != nil {
			return errors.Wrap(err, "failed to wait for ring-based replication service")
		}
	}
	return nil
}

// clusterWait returns a function that inspects the current peer state and returns
// a duration of one base timeout for each peer with a higher ID than ourselves.
func clusterWait(position func() int, timeout time.Duration) func() time.Duration {
	return func() time.Duration {
		return time.Duration(position()) * timeout
	}
}

// ApplyConfig applies a new configuration to an Alertmanager.
func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg string) error {
	templateFiles := make([]string, len(conf.Templates))
	for i, t := range conf.Templates {
		templateFilepath, err := safeTemplateFilepath(filepath.Join(am.cfg.TenantDataDir, templatesDir), t)
		if err != nil {
			return err
		}

		templateFiles[i] = templateFilepath
	}

	tmpl, err := template.FromGlobs(templateFiles...)
	if err != nil {
		return err
	}
	tmpl.ExternalURL = am.cfg.ExternalURL

	am.api.Update(conf, func(_ model.LabelSet) {})

	// Ensure the inhibitor is set before being called.
	if am.inhibitor != nil {
		am.inhibitor.Stop()
	}

	// Ensure the dispatcher is set before being called.
	if am.dispatcher != nil {
		am.dispatcher.Stop()
	}

	am.inhibitor = inhibit.NewInhibitor(am.alerts, conf.InhibitRules, am.marker, log.With(am.logger, "component", "inhibitor"))

	waitFunc := clusterWait(am.state.Position, am.cfg.PeerTimeout)

	timeoutFunc := func(d time.Duration) time.Duration {
		if d < notify.MinTimeout {
			d = notify.MinTimeout
		}
		return d + waitFunc()
	}

	// Create a firewall bound to the per-tenant config.
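	// The dialer consults the per-tenant limits (see firewallDialerConfigProvider below)
	// to block outgoing notification traffic to the configured CIDR ranges and, optionally,
	// to private addresses.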
	firewallDialer := util_net.NewFirewallDialer(newFirewallDialerConfigProvider(userID, am.cfg.Limits))

	integrationsMap, err := buildIntegrationsMap(conf.Receivers, tmpl, firewallDialer, am.logger, func(integrationName string, notifier notify.Notifier) notify.Notifier {
		if am.cfg.Limits != nil {
			rl := &tenantRateLimits{
				tenant:      userID,
				limits:      am.cfg.Limits,
				integration: integrationName,
			}

			return newRateLimitedNotifier(notifier, rl, 10*time.Second, am.rateLimitedNotifications.WithLabelValues(integrationName))
		}
		return notifier
	})
	if err != nil {
		return err
	}

	muteTimes := make(map[string][]timeinterval.TimeInterval, len(conf.MuteTimeIntervals))
	for _, ti := range conf.MuteTimeIntervals {
		muteTimes[ti.Name] = ti.TimeIntervals
	}

	pipeline := am.pipelineBuilder.New(
		integrationsMap,
		waitFunc,
		am.inhibitor,
		silence.NewSilencer(am.silences, am.marker, am.logger),
		muteTimes,
		am.nflog,
		am.state,
	)
	am.lastPipeline = pipeline
	am.dispatcher = dispatch.NewDispatcher(
		am.alerts,
		dispatch.NewRoute(conf.Route, nil),
		pipeline,
		am.marker,
		timeoutFunc,
		&dispatcherLimits{tenant: am.cfg.UserID, limits: am.cfg.Limits},
		log.With(am.logger, "component", "dispatcher"),
		am.dispatcherMetrics,
	)

	go am.dispatcher.Run()
	go am.inhibitor.Run()

	am.configHashMetric.Set(md5HashAsMetricValue([]byte(rawCfg)))
	return nil
}

// Stop stops the Alertmanager.
func (am *Alertmanager) Stop() {
	if am.inhibitor != nil {
		am.inhibitor.Stop()
	}

	if am.dispatcher != nil {
		am.dispatcher.Stop()
	}

	if am.persister != nil {
		am.persister.StopAsync()
	}

	if service, ok := am.state.(services.Service); ok {
		service.StopAsync()
	}

	am.alerts.Close()
	close(am.stop)
}

func (am *Alertmanager) StopAndWait() {
	am.Stop()

	if am.persister != nil {
		if err := am.persister.AwaitTerminated(context.Background()); err != nil {
			level.Warn(am.logger).Log("msg", "error while stopping state persister service", "err", err)
		}
	}

	if service, ok := am.state.(services.Service); ok {
		if err := service.AwaitTerminated(context.Background()); err != nil {
			level.Warn(am.logger).Log("msg", "error while stopping ring-based replication service", "err", err)
		}
	}

	am.wg.Wait()
}

func (am *Alertmanager) mergePartialExternalState(part *clusterpb.Part) error {
	if state, ok := am.state.(*state); ok {
		return state.MergePartialState(part)
	}
	return errors.New("ring-based sharding not enabled")
}

func (am *Alertmanager) getFullState() (*clusterpb.FullState, error) {
	if state, ok := am.state.(*state); ok {
		return state.GetFullState()
	}
	return nil, errors.New("ring-based sharding not enabled")
}

// buildIntegrationsMap builds a map of receiver name to the list of integration notifiers
// from a list of receiver configs.
func buildIntegrationsMap(nc []*config.Receiver, tmpl *template.Template, firewallDialer *util_net.FirewallDialer, logger log.Logger, notifierWrapper func(string, notify.Notifier) notify.Notifier) (map[string][]notify.Integration, error) {
	integrationsMap := make(map[string][]notify.Integration, len(nc))
	for _, rcv := range nc {
		integrations, err := buildReceiverIntegrations(rcv, tmpl, firewallDialer, logger, notifierWrapper)
		if err != nil {
			return nil, err
		}
		integrationsMap[rcv.Name] = integrations
	}
	return integrationsMap, nil
}

// buildReceiverIntegrations builds a list of integration notifiers off of a
// receiver config.
// Taken from https://github.com/prometheus/alertmanager/blob/94d875f1227b29abece661db1a68c001122d1da5/cmd/alertmanager/main.go#L112-L159.
func buildReceiverIntegrations(nc *config.Receiver, tmpl *template.Template, firewallDialer *util_net.FirewallDialer, logger log.Logger, wrapper func(string, notify.Notifier) notify.Notifier) ([]notify.Integration, error) {
	var (
		errs         types.MultiError
		integrations []notify.Integration
		add          = func(name string, i int, rs notify.ResolvedSender, f func(l log.Logger) (notify.Notifier, error)) {
			n, err := f(log.With(logger, "integration", name))
			if err != nil {
				errs.Add(err)
				return
			}
			n = wrapper(name, n)
			integrations = append(integrations, notify.NewIntegration(n, rs, name, i))
		}
	)

	// Inject the firewall to any receiver integration supporting it.
	httpOps := []commoncfg.HTTPClientOption{
		commoncfg.WithDialContextFunc(firewallDialer.DialContext),
	}

	for i, c := range nc.WebhookConfigs {
		add("webhook", i, c, func(l log.Logger) (notify.Notifier, error) { return webhook.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.EmailConfigs {
		add("email", i, c, func(l log.Logger) (notify.Notifier, error) { return email.New(c, tmpl, l), nil })
	}
	for i, c := range nc.PagerdutyConfigs {
		add("pagerduty", i, c, func(l log.Logger) (notify.Notifier, error) { return pagerduty.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.OpsGenieConfigs {
		add("opsgenie", i, c, func(l log.Logger) (notify.Notifier, error) { return opsgenie.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.WechatConfigs {
		add("wechat", i, c, func(l log.Logger) (notify.Notifier, error) { return wechat.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.SlackConfigs {
		add("slack", i, c, func(l log.Logger) (notify.Notifier, error) { return slack.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.VictorOpsConfigs {
		add("victorops", i, c, func(l log.Logger) (notify.Notifier, error) { return victorops.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.PushoverConfigs {
		add("pushover", i, c, func(l log.Logger) (notify.Notifier, error) { return pushover.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.SNSConfigs {
		add("sns", i, c, func(l log.Logger) (notify.Notifier, error) { return sns.New(c, tmpl, l, httpOps...) })
	}
	// If we add support for more integrations, we need to add them to validation as well. See validation.allowedIntegrationNames field.
	if errs.Len() > 0 {
		return nil, &errs
	}
	return integrations, nil
}

func md5HashAsMetricValue(data []byte) float64 {
	sum := md5.Sum(data)
	// We only want 48 bits as a float64 only has a 53 bit mantissa.
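	// Taking only the first 6 bytes (48 bits) of the MD5 sum keeps the value exactly
	// representable as a float64, so every replica exports an identical gauge value
	// for the same configuration.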
	smallSum := sum[0:6]
	var bytes = make([]byte, 8)
	copy(bytes, smallSum)
	return float64(binary.LittleEndian.Uint64(bytes))
}

// NilPeer and NilChannel implement the Alertmanager clustering interfaces used by the API
// to expose cluster information. In a multi-tenant environment we choose not to expose
// this information to tenants, so these are no-op implementations.
type NilPeer struct{}

func (p *NilPeer) Name() string                    { return "" }
func (p *NilPeer) Status() string                  { return "ready" }
func (p *NilPeer) Peers() []cluster.ClusterMember  { return nil }
func (p *NilPeer) Position() int                   { return 0 }
func (p *NilPeer) WaitReady(context.Context) error { return nil }
func (p *NilPeer) AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel {
	return &NilChannel{}
}

type NilChannel struct{}

func (c *NilChannel) Broadcast([]byte) {}

type firewallDialerConfigProvider struct {
	userID string
	limits Limits
}

func newFirewallDialerConfigProvider(userID string, limits Limits) firewallDialerConfigProvider {
	return firewallDialerConfigProvider{
		userID: userID,
		limits: limits,
	}
}

func (p firewallDialerConfigProvider) BlockCIDRNetworks() []flagext.CIDR {
	return p.limits.AlertmanagerReceiversBlockCIDRNetworks(p.userID)
}

func (p firewallDialerConfigProvider) BlockPrivateAddresses() bool {
	return p.limits.AlertmanagerReceiversBlockPrivateAddresses(p.userID)
}

type tenantRateLimits struct {
	tenant      string
	integration string
	limits      Limits
}

func (t *tenantRateLimits) RateLimit() rate.Limit {
	return t.limits.NotificationRateLimit(t.tenant, t.integration)
}

func (t *tenantRateLimits) Burst() int {
	return t.limits.NotificationBurstSize(t.tenant, t.integration)
}

type dispatcherLimits struct {
	tenant string
	limits Limits
}

func (g *dispatcherLimits) MaxNumberOfAggregationGroups() int {
	return g.limits.AlertmanagerMaxDispatcherAggregationGroups(g.tenant)
}

var (
	errTooManyAlerts = "too many alerts, limit: %d"
	errAlertsTooBig  = "alerts too big, total size limit: %d bytes"
)

// alertsLimiter limits the number and size of alerts being received by the Alertmanager.
// We consider an alert unique based on its fingerprint (a hash of its labels); its size
// is determined by the sum of the bytes of its labels, annotations, and generator URL.
type alertsLimiter struct {
	tenant string
	limits Limits

	failureCounter prometheus.Counter

	mx        sync.Mutex
	sizes     map[model.Fingerprint]int
	count     int
	totalSize int
}

func newAlertsLimiter(tenant string, limits Limits, reg prometheus.Registerer) *alertsLimiter {
	limiter := &alertsLimiter{
		tenant: tenant,
		limits: limits,
		sizes:  map[model.Fingerprint]int{},
		failureCounter: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "alertmanager_alerts_insert_limited_total",
			Help: "Number of failures to insert new alerts to in-memory alert store.",
		}),
	}

	promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "alertmanager_alerts_limiter_current_alerts",
		Help: "Number of alerts tracked by alerts limiter.",
	}, func() float64 {
		c, _ := limiter.currentStats()
		return float64(c)
	})

	promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "alertmanager_alerts_limiter_current_alerts_size_bytes",
		Help: "Total size of alerts tracked by alerts limiter.",
	}, func() float64 {
		_, s := limiter.currentStats()
		return float64(s)
	})

	return limiter
}

func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error {
	if alert == nil {
		return nil
	}

	fp := alert.Fingerprint()

	countLimit := a.limits.AlertmanagerMaxAlertsCount(a.tenant)
	sizeLimit := a.limits.AlertmanagerMaxAlertsSizeBytes(a.tenant)

	sizeDiff := alertSize(alert.Alert)

	a.mx.Lock()
	defer a.mx.Unlock()

	if !existing && countLimit > 0 && (a.count+1) > countLimit {
		a.failureCounter.Inc()
		return fmt.Errorf(errTooManyAlerts, countLimit)
	}

	if existing {
		sizeDiff -= a.sizes[fp]
	}

	if sizeLimit > 0 && (a.totalSize+sizeDiff) > sizeLimit {
		a.failureCounter.Inc()
		return fmt.Errorf(errAlertsTooBig, sizeLimit)
	}

	return nil
}

func (a *alertsLimiter) PostStore(alert *types.Alert, existing bool) {
	if alert == nil {
		return
	}

	newSize := alertSize(alert.Alert)
	fp := alert.Fingerprint()

	a.mx.Lock()
	defer a.mx.Unlock()

	if existing {
		a.totalSize -= a.sizes[fp]
	} else {
		a.count++
	}
	a.sizes[fp] = newSize
	a.totalSize += newSize
}

func (a *alertsLimiter) PostDelete(alert *types.Alert) {
	if alert == nil {
		return
	}

	fp := alert.Fingerprint()

	a.mx.Lock()
	defer a.mx.Unlock()

	a.totalSize -= a.sizes[fp]
	delete(a.sizes, fp)
	a.count--
}

func (a *alertsLimiter) currentStats() (count, totalSize int) {
	a.mx.Lock()
	defer a.mx.Unlock()

	return a.count, a.totalSize
}

func alertSize(alert model.Alert) int {
	size := 0
	for l, v := range alert.Labels {
		size += len(l)
		size += len(v)
	}
	for l, v := range alert.Annotations {
		size += len(l)
		size += len(v)
	}
	size += len(alert.GeneratorURL)
	return size
}
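
// The sketch below is not part of the original file; it illustrates the intended
// lifecycle of a per-tenant Alertmanager as exposed above: New -> WaitInitialStateSync ->
// ApplyConfig -> StopAndWait. In Cortex the multitenant Alertmanager drives these calls;
// the tenant ID, data directory, external URL, and timeouts are hypothetical placeholders,
// and Limits is left unset for brevity (which disables per-tenant limits and firewalling).
func exampleLifecycle(conf *config.Config, rawCfg string) error {
	externalURL, err := url.Parse("http://localhost/alertmanager") // hypothetical external URL
	if err != nil {
		return err
	}

	am, err := New(&Config{
		UserID:        "tenant-1", // hypothetical tenant ID
		Logger:        log.NewNopLogger(),
		PeerTimeout:   15 * time.Second,
		Retention:     120 * time.Hour,
		ExternalURL:   externalURL,
		TenantDataDir: "/tmp/alertmanager/tenant-1", // hypothetical data directory
	}, prometheus.NewRegistry())
	if err != nil {
		return err
	}
	defer am.StopAndWait()

	// Without gossip or ring-based replication there is no remote state to sync,
	// but calling this keeps the sequence identical to the replicated case.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	if err := am.WaitInitialStateSync(ctx); err != nil {
		return err
	}

	// conf is an already parsed and validated Alertmanager configuration; rawCfg is
	// the raw YAML, used only to compute the config hash gauge.
	return am.ApplyConfig("tenant-1", conf, rawCfg)
}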