github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/alertmanager.go

package alertmanager

import (
	"context"
	"crypto/md5"
	"encoding/binary"
	"fmt"
	"net/http"
	"net/url"
	"path"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/flagext"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/alertmanager/api"
	"github.com/prometheus/alertmanager/cluster"
	"github.com/prometheus/alertmanager/cluster/clusterpb"
	"github.com/prometheus/alertmanager/config"
	"github.com/prometheus/alertmanager/dispatch"
	"github.com/prometheus/alertmanager/inhibit"
	"github.com/prometheus/alertmanager/nflog"
	"github.com/prometheus/alertmanager/notify"
	"github.com/prometheus/alertmanager/notify/email"
	"github.com/prometheus/alertmanager/notify/opsgenie"
	"github.com/prometheus/alertmanager/notify/pagerduty"
	"github.com/prometheus/alertmanager/notify/pushover"
	"github.com/prometheus/alertmanager/notify/slack"
	"github.com/prometheus/alertmanager/notify/sns"
	"github.com/prometheus/alertmanager/notify/victorops"
	"github.com/prometheus/alertmanager/notify/webhook"
	"github.com/prometheus/alertmanager/notify/wechat"
	"github.com/prometheus/alertmanager/provider/mem"
	"github.com/prometheus/alertmanager/silence"
	"github.com/prometheus/alertmanager/template"
	"github.com/prometheus/alertmanager/timeinterval"
	"github.com/prometheus/alertmanager/types"
	"github.com/prometheus/alertmanager/ui"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	commoncfg "github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	"github.com/prometheus/common/route"
	"golang.org/x/time/rate"

	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
	util_net "github.com/cortexproject/cortex/pkg/util/net"
)
    54  
const (
	// maintenancePeriod is used for the periodic storing of silences and notifications to the local file.
	maintenancePeriod = 15 * time.Minute

	// Filenames used within the tenant directory.
	notificationLogSnapshot = "notifications"
	silencesSnapshot        = "silences"
	templatesDir            = "templates"
)
    64  
// Config configures an Alertmanager.
type Config struct {
	UserID      string
	Logger      log.Logger
	Peer        *cluster.Peer
	PeerTimeout time.Duration
	Retention   time.Duration
	ExternalURL *url.URL
	Limits      Limits

	// Tenant-specific local directory where the Alertmanager can store its state
	// (notifications, silences, templates). When the Alertmanager is stopped, the entire directory is removed.
	TenantDataDir string

	ShardingEnabled   bool
	ReplicationFactor int
	Replicator        Replicator
	Store             alertstore.AlertStore
	PersisterConfig   PersisterConfig
}
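// A minimal construction sketch (hypothetical values, not part of the original
// file) for the simplest mode, i.e. Peer == nil and ShardingEnabled == false,
// in which New falls back to the no-op NilPeer state:
//
//	externalURL, _ := url.Parse("http://localhost:8080/alertmanager")
//	am, err := New(&Config{
//		UserID:        "tenant-1",
//		Logger:        log.NewNopLogger(),
//		PeerTimeout:   15 * time.Second,
//		Retention:     120 * time.Hour,
//		ExternalURL:   externalURL,
//		TenantDataDir: "/data/alertmanager/tenant-1",
//	}, prometheus.NewRegistry())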
    84  
// An Alertmanager manages the alerts for one user.
type Alertmanager struct {
	cfg             *Config
	api             *api.API
	logger          log.Logger
	state           State
	persister       *statePersister
	nflog           *nflog.Log
	silences        *silence.Silences
	marker          types.Marker
	alerts          *mem.Alerts
	dispatcher      *dispatch.Dispatcher
	inhibitor       *inhibit.Inhibitor
	pipelineBuilder *notify.PipelineBuilder
	stop            chan struct{}
	wg              sync.WaitGroup
	mux             *http.ServeMux
	registry        *prometheus.Registry

	// Pipeline created during the last ApplyConfig call. Used for testing only.
	lastPipeline notify.Stage

	// The Dispatcher is the only component we need to recreate when we call ApplyConfig.
	// Since its metrics don't have any variable labels, we need to re-use the same metrics across recreations.
	dispatcherMetrics *dispatch.DispatcherMetrics
	// This needs to be set to the hash of the config. All the hashes need to be the same
	// for deduplication of alerts to work, hence we need this metric. See https://github.com/prometheus/alertmanager/issues/596.
	// Further, in upstream Alertmanager this metric is handled by the config coordinator, which we don't use,
	// hence we need to generate the metric ourselves.
	configHashMetric prometheus.Gauge

	rateLimitedNotifications *prometheus.CounterVec
}
   118  
var (
	webReload = make(chan chan error)
)

func init() {
	go func() {
		// Since this is not a "normal" Alertmanager which reads its config
		// from disk, we just accept and ignore web-based reload signals. Config
		// updates are only applied externally via ApplyConfig().
		for range webReload {
		}
	}()
}
   132  
// State helps with replication and synchronization of notifications and silences across several alertmanager replicas.
type State interface {
	AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel
	Position() int
	WaitReady(context.Context) error
}

// Replicator is used to exchange state with peers via the ring when sharding is enabled.
type Replicator interface {
	// ReplicateStateForUser writes the given partial state to the necessary replicas.
	ReplicateStateForUser(ctx context.Context, userID string, part *clusterpb.Part) error
	// GetPositionForUser returns the position of this replica relative to the others.
	// The alertmanager replication protocol relies on this position: it is used to
	// identify which replica should notify about the alert first.
	GetPositionForUser(userID string) int
	// ReadFullStateForUser obtains the full state from other replicas in the cluster.
	ReadFullStateForUser(context.Context, string) ([]*clusterpb.FullState, error)
}
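// Illustration (not part of the original file): all three operational modes
// feed the same State interface. Gossip clustering uses *cluster.Peer,
// ring-based sharding uses the replicated state type, and the no-replication
// mode uses NilPeer (defined near the bottom of this file). A compile-time
// assertion makes the NilPeer contract explicit:
var _ State = &NilPeer{}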
   150  
// New creates a new Alertmanager.
func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
	if cfg.TenantDataDir == "" {
		return nil, fmt.Errorf("directory for tenant-specific AlertManager is not configured")
	}

	am := &Alertmanager{
		cfg:    cfg,
		logger: log.With(cfg.Logger, "user", cfg.UserID),
		stop:   make(chan struct{}),
		configHashMetric: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
			Name: "alertmanager_config_hash",
			Help: "Hash of the currently loaded alertmanager configuration.",
		}),

		rateLimitedNotifications: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "alertmanager_notification_rate_limited_total",
			Help: "Number of rate-limited notifications per integration.",
		}, []string{"integration"}), // "integration" is consistent with other alertmanager metrics.
	}

	am.registry = reg

	// We currently have 3 operational modes:
	// 1) Alertmanager clustering with upstream Gossip
	// 2) Alertmanager sharding and ring-based replication
	// 3) Alertmanager no replication
	// These are covered in order.
	if cfg.Peer != nil {
		level.Debug(am.logger).Log("msg", "starting tenant alertmanager with gossip-based replication")
		am.state = cfg.Peer
	} else if cfg.ShardingEnabled {
		level.Debug(am.logger).Log("msg", "starting tenant alertmanager with ring-based replication")
		state := newReplicatedStates(cfg.UserID, cfg.ReplicationFactor, cfg.Replicator, cfg.Store, am.logger, am.registry)
		am.state = state
		am.persister = newStatePersister(cfg.PersisterConfig, cfg.UserID, state, cfg.Store, am.logger, am.registry)
	} else {
		level.Debug(am.logger).Log("msg", "starting tenant alertmanager without replication")
		am.state = &NilPeer{}
	}

	am.wg.Add(1)
	var err error
	am.nflog, err = nflog.New(
		nflog.WithRetention(cfg.Retention),
		nflog.WithSnapshot(filepath.Join(cfg.TenantDataDir, notificationLogSnapshot)),
		nflog.WithMaintenance(maintenancePeriod, am.stop, am.wg.Done, nil),
		nflog.WithMetrics(am.registry),
		nflog.WithLogger(log.With(am.logger, "component", "nflog")),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create notification log: %v", err)
	}

	c := am.state.AddState("nfl:"+cfg.UserID, am.nflog, am.registry)
	am.nflog.SetBroadcast(c.Broadcast)

	am.marker = types.NewMarker(am.registry)

	silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot)
	am.silences, err = silence.New(silence.Options{
		SnapshotFile: silencesFile,
		Retention:    cfg.Retention,
		Logger:       log.With(am.logger, "component", "silences"),
		Metrics:      am.registry,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to create silences: %v", err)
	}

	c = am.state.AddState("sil:"+cfg.UserID, am.silences, am.registry)
	am.silences.SetBroadcast(c.Broadcast)

	// State replication needs to be started after the state keys are defined.
	if service, ok := am.state.(services.Service); ok {
		if err := service.StartAsync(context.Background()); err != nil {
			return nil, errors.Wrap(err, "failed to start ring-based replication service")
		}
	}

	if am.persister != nil {
		if err := am.persister.StartAsync(context.Background()); err != nil {
			return nil, errors.Wrap(err, "failed to start state persister service")
		}
	}

	am.pipelineBuilder = notify.NewPipelineBuilder(am.registry)

	am.wg.Add(1)
	go func() {
		am.silences.Maintenance(maintenancePeriod, silencesFile, am.stop, nil)
		am.wg.Done()
	}()

	var callback mem.AlertStoreCallback
	if am.cfg.Limits != nil {
		callback = newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, reg)
	}

	am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, callback, am.logger)
	if err != nil {
		return nil, fmt.Errorf("failed to create alerts: %v", err)
	}

	am.api, err = api.New(api.Options{
		Alerts:     am.alerts,
		Silences:   am.silences,
		StatusFunc: am.marker.Status,
		// Cortex should not expose cluster information back to its tenants.
		Peer:     &NilPeer{},
		Registry: am.registry,
		Logger:   log.With(am.logger, "component", "api"),
		GroupFunc: func(f1 func(*dispatch.Route) bool, f2 func(*types.Alert, time.Time) bool) (dispatch.AlertGroups, map[model.Fingerprint][]string) {
			return am.dispatcher.Groups(f1, f2)
		},
	})
	if err != nil {
		return nil, fmt.Errorf("failed to create api: %v", err)
	}

	router := route.New().WithPrefix(am.cfg.ExternalURL.Path)

	ui.Register(router, webReload, log.With(am.logger, "component", "ui"))
	am.mux = am.api.Register(router, am.cfg.ExternalURL.Path)

	// Override some extra paths registered in the router (e.g. /metrics, which by default exposes prometheus.DefaultRegisterer).
	// The entire router is registered in the mux under the "/" path, so there is no conflict with overriding specific paths.
	for _, p := range []string{"/metrics", "/-/reload", "/debug/"} {
		a := path.Join(am.cfg.ExternalURL.Path, p)
		// Preserve the trailing slash, as for the mux it means an entire subtree.
		if strings.HasSuffix(p, "/") {
			a = a + "/"
		}
		am.mux.Handle(a, http.NotFoundHandler())
	}

	am.dispatcherMetrics = dispatch.NewDispatcherMetrics(true, am.registry)

	// TODO: From this point onward, the alertmanager _might_ receive requests - we need to make sure we've settled and are ready.
	return am, nil
}
   293  
func (am *Alertmanager) WaitInitialStateSync(ctx context.Context) error {
	if service, ok := am.state.(services.Service); ok {
		if err := service.AwaitRunning(ctx); err != nil {
			return errors.Wrap(err, "failed to wait for ring-based replication service")
		}
	}
	return nil
}

// clusterWait returns a function that inspects the current peer state and returns
// a duration of one base timeout for each peer with a higher ID than ourselves.
func clusterWait(position func() int, timeout time.Duration) func() time.Duration {
	return func() time.Duration {
		return time.Duration(position()) * timeout
	}
}
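// Worked example (hypothetical values, not part of the original file): with a
// PeerTimeout of 15s, the replica at position 0 notifies immediately, position
// 1 waits 15s, and position 2 waits 30s. A later replica therefore only sends
// a notification if the replicas before it failed to record one in the
// notification log within their window:
//
//	wait := clusterWait(func() int { return 2 }, 15*time.Second)
//	fmt.Println(wait()) // 30s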
   310  
// ApplyConfig applies a new configuration to an Alertmanager.
func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg string) error {
	templateFiles := make([]string, len(conf.Templates))
	for i, t := range conf.Templates {
		templateFilepath, err := safeTemplateFilepath(filepath.Join(am.cfg.TenantDataDir, templatesDir), t)
		if err != nil {
			return err
		}

		templateFiles[i] = templateFilepath
	}

	tmpl, err := template.FromGlobs(templateFiles...)
	if err != nil {
		return err
	}
	tmpl.ExternalURL = am.cfg.ExternalURL

	am.api.Update(conf, func(_ model.LabelSet) {})

	// Ensure the inhibitor is set before being called.
	if am.inhibitor != nil {
		am.inhibitor.Stop()
	}

	// Ensure the dispatcher is set before being called.
	if am.dispatcher != nil {
		am.dispatcher.Stop()
	}

	am.inhibitor = inhibit.NewInhibitor(am.alerts, conf.InhibitRules, am.marker, log.With(am.logger, "component", "inhibitor"))

	waitFunc := clusterWait(am.state.Position, am.cfg.PeerTimeout)

	timeoutFunc := func(d time.Duration) time.Duration {
		if d < notify.MinTimeout {
			d = notify.MinTimeout
		}
		return d + waitFunc()
	}

	// Create a firewall dialer bound to the per-tenant config.
	firewallDialer := util_net.NewFirewallDialer(newFirewallDialerConfigProvider(userID, am.cfg.Limits))

	integrationsMap, err := buildIntegrationsMap(conf.Receivers, tmpl, firewallDialer, am.logger, func(integrationName string, notifier notify.Notifier) notify.Notifier {
		if am.cfg.Limits != nil {
			rl := &tenantRateLimits{
				tenant:      userID,
				limits:      am.cfg.Limits,
				integration: integrationName,
			}

			return newRateLimitedNotifier(notifier, rl, 10*time.Second, am.rateLimitedNotifications.WithLabelValues(integrationName))
		}
		return notifier
	})
	if err != nil {
		return err
	}

	muteTimes := make(map[string][]timeinterval.TimeInterval, len(conf.MuteTimeIntervals))
	for _, ti := range conf.MuteTimeIntervals {
		muteTimes[ti.Name] = ti.TimeIntervals
	}

	pipeline := am.pipelineBuilder.New(
		integrationsMap,
		waitFunc,
		am.inhibitor,
		silence.NewSilencer(am.silences, am.marker, am.logger),
		muteTimes,
		am.nflog,
		am.state,
	)
	am.lastPipeline = pipeline
	am.dispatcher = dispatch.NewDispatcher(
		am.alerts,
		dispatch.NewRoute(conf.Route, nil),
		pipeline,
		am.marker,
		timeoutFunc,
		&dispatcherLimits{tenant: am.cfg.UserID, limits: am.cfg.Limits},
		log.With(am.logger, "component", "dispatcher"),
		am.dispatcherMetrics,
	)

	go am.dispatcher.Run()
	go am.inhibitor.Run()

	am.configHashMetric.Set(md5HashAsMetricValue([]byte(rawCfg)))
	return nil
}
   403  
// Stop stops the Alertmanager.
func (am *Alertmanager) Stop() {
	if am.inhibitor != nil {
		am.inhibitor.Stop()
	}

	if am.dispatcher != nil {
		am.dispatcher.Stop()
	}

	if am.persister != nil {
		am.persister.StopAsync()
	}

	if service, ok := am.state.(services.Service); ok {
		service.StopAsync()
	}

	am.alerts.Close()
	close(am.stop)
}

func (am *Alertmanager) StopAndWait() {
	am.Stop()

	if am.persister != nil {
		if err := am.persister.AwaitTerminated(context.Background()); err != nil {
			level.Warn(am.logger).Log("msg", "error while stopping state persister service", "err", err)
		}
	}

	if service, ok := am.state.(services.Service); ok {
		if err := service.AwaitTerminated(context.Background()); err != nil {
			level.Warn(am.logger).Log("msg", "error while stopping ring-based replication service", "err", err)
		}
	}

	am.wg.Wait()
}

func (am *Alertmanager) mergePartialExternalState(part *clusterpb.Part) error {
	if state, ok := am.state.(*state); ok {
		return state.MergePartialState(part)
	}
	return errors.New("ring-based sharding not enabled")
}

func (am *Alertmanager) getFullState() (*clusterpb.FullState, error) {
	if state, ok := am.state.(*state); ok {
		return state.GetFullState()
	}
	return nil, errors.New("ring-based sharding not enabled")
}
   457  
// buildIntegrationsMap builds a map from receiver name to the list of integration
// notifiers for a list of receiver configs.
func buildIntegrationsMap(nc []*config.Receiver, tmpl *template.Template, firewallDialer *util_net.FirewallDialer, logger log.Logger, notifierWrapper func(string, notify.Notifier) notify.Notifier) (map[string][]notify.Integration, error) {
	integrationsMap := make(map[string][]notify.Integration, len(nc))
	for _, rcv := range nc {
		integrations, err := buildReceiverIntegrations(rcv, tmpl, firewallDialer, logger, notifierWrapper)
		if err != nil {
			return nil, err
		}
		integrationsMap[rcv.Name] = integrations
	}
	return integrationsMap, nil
}
   471  
// buildReceiverIntegrations builds a list of integration notifiers from a
// receiver config.
// Taken from https://github.com/prometheus/alertmanager/blob/94d875f1227b29abece661db1a68c001122d1da5/cmd/alertmanager/main.go#L112-L159.
func buildReceiverIntegrations(nc *config.Receiver, tmpl *template.Template, firewallDialer *util_net.FirewallDialer, logger log.Logger, wrapper func(string, notify.Notifier) notify.Notifier) ([]notify.Integration, error) {
	var (
		errs         types.MultiError
		integrations []notify.Integration
		add          = func(name string, i int, rs notify.ResolvedSender, f func(l log.Logger) (notify.Notifier, error)) {
			n, err := f(log.With(logger, "integration", name))
			if err != nil {
				errs.Add(err)
				return
			}
			n = wrapper(name, n)
			integrations = append(integrations, notify.NewIntegration(n, rs, name, i))
		}
	)

	// Inject the firewall into any receiver integration supporting it.
	httpOps := []commoncfg.HTTPClientOption{
		commoncfg.WithDialContextFunc(firewallDialer.DialContext),
	}

	for i, c := range nc.WebhookConfigs {
		add("webhook", i, c, func(l log.Logger) (notify.Notifier, error) { return webhook.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.EmailConfigs {
		add("email", i, c, func(l log.Logger) (notify.Notifier, error) { return email.New(c, tmpl, l), nil })
	}
	for i, c := range nc.PagerdutyConfigs {
		add("pagerduty", i, c, func(l log.Logger) (notify.Notifier, error) { return pagerduty.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.OpsGenieConfigs {
		add("opsgenie", i, c, func(l log.Logger) (notify.Notifier, error) { return opsgenie.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.WechatConfigs {
		add("wechat", i, c, func(l log.Logger) (notify.Notifier, error) { return wechat.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.SlackConfigs {
		add("slack", i, c, func(l log.Logger) (notify.Notifier, error) { return slack.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.VictorOpsConfigs {
		add("victorops", i, c, func(l log.Logger) (notify.Notifier, error) { return victorops.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.PushoverConfigs {
		add("pushover", i, c, func(l log.Logger) (notify.Notifier, error) { return pushover.New(c, tmpl, l, httpOps...) })
	}
	for i, c := range nc.SNSConfigs {
		add("sns", i, c, func(l log.Logger) (notify.Notifier, error) { return sns.New(c, tmpl, l, httpOps...) })
	}
	// If we add support for more integrations, we need to add them to validation as well. See the validation.allowedIntegrationNames field.
	if errs.Len() > 0 {
		return nil, &errs
	}
	return integrations, nil
}
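// Sketch of how a further integration would be wired in (hypothetical
// "telegram" config type and constructor, not part of this codebase): one loop
// per config slice, using the same add/wrapper pattern, plus an entry in
// validation.allowedIntegrationNames as noted above.
//
//	for i, c := range nc.TelegramConfigs {
//		add("telegram", i, c, func(l log.Logger) (notify.Notifier, error) {
//			return telegram.New(c, tmpl, l, httpOps...)
//		})
//	}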
   528  
func md5HashAsMetricValue(data []byte) float64 {
	sum := md5.Sum(data)
	// We only want 48 bits as a float64 only has a 53 bit mantissa.
	smallSum := sum[0:6]
	var bytes = make([]byte, 8)
	copy(bytes, smallSum)
	return float64(binary.LittleEndian.Uint64(bytes))
}
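// Illustration (not part of the original file): because only the first 6 bytes
// (48 bits) of the MD5 sum are used, the result always fits exactly in a
// float64's 53-bit mantissa. Two replicas that load byte-identical raw configs
// therefore export exactly equal gauge values, which is what alert
// deduplication relies on:
//
//	a := md5HashAsMetricValue([]byte("route:\n  receiver: default\n"))
//	b := md5HashAsMetricValue([]byte("route:\n  receiver: default\n"))
//	fmt.Println(a == b) // true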
   537  
// NilPeer and NilChannel implement the Alertmanager clustering interfaces used by the API
// to expose cluster information. In a multi-tenant environment, we choose not to expose
// this information to tenants, so these are no-op implementations.
type NilPeer struct{}

func (p *NilPeer) Name() string                    { return "" }
func (p *NilPeer) Status() string                  { return "ready" }
func (p *NilPeer) Peers() []cluster.ClusterMember  { return nil }
func (p *NilPeer) Position() int                   { return 0 }
func (p *NilPeer) WaitReady(context.Context) error { return nil }
func (p *NilPeer) AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel {
	return &NilChannel{}
}

type NilChannel struct{}

func (c *NilChannel) Broadcast([]byte) {}
   554  
type firewallDialerConfigProvider struct {
	userID string
	limits Limits
}

func newFirewallDialerConfigProvider(userID string, limits Limits) firewallDialerConfigProvider {
	return firewallDialerConfigProvider{
		userID: userID,
		limits: limits,
	}
}

func (p firewallDialerConfigProvider) BlockCIDRNetworks() []flagext.CIDR {
	return p.limits.AlertmanagerReceiversBlockCIDRNetworks(p.userID)
}

func (p firewallDialerConfigProvider) BlockPrivateAddresses() bool {
	return p.limits.AlertmanagerReceiversBlockPrivateAddresses(p.userID)
}
   574  
type tenantRateLimits struct {
	tenant      string
	integration string
	limits      Limits
}

func (t *tenantRateLimits) RateLimit() rate.Limit {
	return t.limits.NotificationRateLimit(t.tenant, t.integration)
}

func (t *tenantRateLimits) Burst() int {
	return t.limits.NotificationBurstSize(t.tenant, t.integration)
}
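// Sketch of how these two values are consumed (an assumption about
// newRateLimitedNotifier's internals, which live in another file): they are
// the standard parameters of a golang.org/x/time/rate token bucket, where
// RateLimit is the sustained notifications-per-second and Burst bounds short
// spikes:
//
//	l := rate.NewLimiter(t.RateLimit(), t.Burst())
//	if !l.Allow() {
//		// drop the notification and increment the rate-limited counter
//	}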
   588  
type dispatcherLimits struct {
	tenant string
	limits Limits
}

func (g *dispatcherLimits) MaxNumberOfAggregationGroups() int {
	return g.limits.AlertmanagerMaxDispatcherAggregationGroups(g.tenant)
}
   597  
var (
	errTooManyAlerts = "too many alerts, limit: %d"
	errAlertsTooBig  = "alerts too big, total size limit: %d bytes"
)

// alertsLimiter limits the number and total size of alerts received by the Alertmanager.
// We consider an alert unique based on its fingerprint (a hash of its labels), and
// its size is determined by the sum of bytes of its labels, annotations, and generator URL.
type alertsLimiter struct {
	tenant string
	limits Limits

	failureCounter prometheus.Counter

	mx        sync.Mutex
	sizes     map[model.Fingerprint]int
	count     int
	totalSize int
}
   617  
func newAlertsLimiter(tenant string, limits Limits, reg prometheus.Registerer) *alertsLimiter {
	limiter := &alertsLimiter{
		tenant: tenant,
		limits: limits,
		sizes:  map[model.Fingerprint]int{},
		failureCounter: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "alertmanager_alerts_insert_limited_total",
			Help: "Number of failures to insert new alerts to in-memory alert store.",
		}),
	}

	promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "alertmanager_alerts_limiter_current_alerts",
		Help: "Number of alerts tracked by alerts limiter.",
	}, func() float64 {
		c, _ := limiter.currentStats()
		return float64(c)
	})

	promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
		Name: "alertmanager_alerts_limiter_current_alerts_size_bytes",
		Help: "Total size of alerts tracked by alerts limiter.",
	}, func() float64 {
		_, s := limiter.currentStats()
		return float64(s)
	})

	return limiter
}
   647  
// PreStore implements mem.AlertStoreCallback. It rejects the alert if storing it
// would exceed the per-tenant count or total-size limits.
func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error {
	if alert == nil {
		return nil
	}

	fp := alert.Fingerprint()

	countLimit := a.limits.AlertmanagerMaxAlertsCount(a.tenant)
	sizeLimit := a.limits.AlertmanagerMaxAlertsSizeBytes(a.tenant)

	sizeDiff := alertSize(alert.Alert)

	a.mx.Lock()
	defer a.mx.Unlock()

	if !existing && countLimit > 0 && (a.count+1) > countLimit {
		a.failureCounter.Inc()
		return fmt.Errorf(errTooManyAlerts, countLimit)
	}

	if existing {
		sizeDiff -= a.sizes[fp]
	}

	if sizeLimit > 0 && (a.totalSize+sizeDiff) > sizeLimit {
		a.failureCounter.Inc()
		return fmt.Errorf(errAlertsTooBig, sizeLimit)
	}

	return nil
}

// PostStore implements mem.AlertStoreCallback. It updates the limiter's count
// and size accounting after the alert has been stored.
func (a *alertsLimiter) PostStore(alert *types.Alert, existing bool) {
	if alert == nil {
		return
	}

	newSize := alertSize(alert.Alert)
	fp := alert.Fingerprint()

	a.mx.Lock()
	defer a.mx.Unlock()

	if existing {
		a.totalSize -= a.sizes[fp]
	} else {
		a.count++
	}
	a.sizes[fp] = newSize
	a.totalSize += newSize
}

// PostDelete implements mem.AlertStoreCallback. It releases the deleted alert's
// contribution to the count and size accounting.
func (a *alertsLimiter) PostDelete(alert *types.Alert) {
	if alert == nil {
		return
	}

	fp := alert.Fingerprint()

	a.mx.Lock()
	defer a.mx.Unlock()

	a.totalSize -= a.sizes[fp]
	delete(a.sizes, fp)
	a.count--
}

func (a *alertsLimiter) currentStats() (count, totalSize int) {
	a.mx.Lock()
	defer a.mx.Unlock()

	return a.count, a.totalSize
}
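// Lifecycle sketch (hypothetical limits and alerts, not part of the original
// file): the in-memory alert store calls PreStore before inserting, PostStore
// after a successful insert, and PostDelete on eviction, which keeps count and
// totalSize consistent.
//
//	// Assume limits return AlertmanagerMaxAlertsCount == 1.
//	l := newAlertsLimiter("tenant-1", limits, prometheus.NewRegistry())
//	_ = l.PreStore(a1, false) // nil: first alert fits
//	l.PostStore(a1, false)    // count=1, totalSize=alertSize(a1.Alert)
//	_ = l.PreStore(a2, false) // error: "too many alerts, limit: 1"
//	l.PostDelete(a1)          // count=0, totalSize=0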
   721  
func alertSize(alert model.Alert) int {
	size := 0
	for l, v := range alert.Labels {
		size += len(l)
		size += len(v)
	}
	for l, v := range alert.Annotations {
		size += len(l)
		size += len(v)
	}
	size += len(alert.GeneratorURL)
	return size
}
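// Worked example (hypothetical alert): labels {alertname="Up", job="api"}
// contribute (9+2)+(3+3) = 17 bytes, an annotation {summary="ok"} adds
// 7+2 = 9, and a GeneratorURL of "http://p" adds 8, so alertSize returns 34.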