github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/multitenant.go

     1  package alertmanager
     2  
     3  import (
     4  	"context"
     5  	"flag"
     6  	"fmt"
     7  	"io/ioutil"
     8  	"net/http"
     9  	"net/url"
    10  	"os"
    11  	"path/filepath"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/go-kit/log"
    17  	"github.com/go-kit/log/level"
    18  	"github.com/grafana/dskit/concurrency"
    19  	"github.com/grafana/dskit/flagext"
    20  	"github.com/grafana/dskit/kv"
    21  	"github.com/grafana/dskit/ring"
    22  	"github.com/grafana/dskit/ring/client"
    23  	"github.com/grafana/dskit/services"
    24  	"github.com/pkg/errors"
    25  	"github.com/prometheus/alertmanager/cluster"
    26  	"github.com/prometheus/alertmanager/cluster/clusterpb"
    27  	amconfig "github.com/prometheus/alertmanager/config"
    28  	"github.com/prometheus/client_golang/prometheus"
    29  	"github.com/prometheus/client_golang/prometheus/promauto"
    30  	tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
    31  	"github.com/weaveworks/common/httpgrpc"
    32  	"github.com/weaveworks/common/httpgrpc/server"
    33  	"github.com/weaveworks/common/user"
    34  	"golang.org/x/time/rate"
    35  
    36  	"github.com/cortexproject/cortex/pkg/alertmanager/alertmanagerpb"
    37  	"github.com/cortexproject/cortex/pkg/alertmanager/alertspb"
    38  	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
    39  	"github.com/cortexproject/cortex/pkg/tenant"
    40  	"github.com/cortexproject/cortex/pkg/util"
    41  	util_log "github.com/cortexproject/cortex/pkg/util/log"
    42  )
    43  
    44  const (
    45  	// If a config sets the webhook URL to this, it will be rewritten to
    46  	// a URL derived from Config.AutoWebhookRoot
    47  	autoWebhookURL = "http://internal.monitor"
    48  
    49  	// Reasons for (re)syncing alertmanager configurations from object storage.
    50  	reasonPeriodic   = "periodic"
    51  	reasonInitial    = "initial"
    52  	reasonRingChange = "ring-change"
    53  
    54  	// ringAutoForgetUnhealthyPeriods is the number of consecutive heartbeat timeout periods after which
    55  	// an unhealthy instance in the ring is automatically forgotten (removed).
    56  	ringAutoForgetUnhealthyPeriods = 5
    57  )
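
        // For illustration: with -alertmanager.configs.auto-webhook-root=http://monitor.svc (a
        // hypothetical root URL), a receiver of tenant "team-a" whose webhook URL is set to the
        // placeholder above is rewritten by setConfig() to:
        //
        //	http://monitor.svc/team-a/monitor
        //
        // The tenant name and root URL here are illustrative only.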
    58  
    59  var (
    60  	errInvalidExternalURL                  = errors.New("the configured external URL is invalid: should not end with /")
    61  	errShardingLegacyStorage               = errors.New("deprecated -alertmanager.storage.* not supported with -alertmanager.sharding-enabled, use -alertmanager-storage.*")
    62  	errShardingUnsupportedStorage          = errors.New("the configured alertmanager storage backend is not supported when sharding is enabled")
    63  	errZoneAwarenessEnabledWithoutZoneInfo = errors.New("the configured alertmanager has zone awareness enabled but zone is not set")
    64  )
    65  
    66  // MultitenantAlertmanagerConfig is the configuration for a multitenant Alertmanager.
    67  type MultitenantAlertmanagerConfig struct {
    68  	DataDir        string           `yaml:"data_dir"`
    69  	Retention      time.Duration    `yaml:"retention"`
    70  	ExternalURL    flagext.URLValue `yaml:"external_url"`
    71  	PollInterval   time.Duration    `yaml:"poll_interval"`
    72  	MaxRecvMsgSize int64            `yaml:"max_recv_msg_size"`
    73  
    74  	// Enable sharding for the Alertmanager
    75  	ShardingEnabled bool       `yaml:"sharding_enabled"`
    76  	ShardingRing    RingConfig `yaml:"sharding_ring"`
    77  
    78  	FallbackConfigFile string `yaml:"fallback_config_file"`
    79  	AutoWebhookRoot    string `yaml:"auto_webhook_root"`
    80  
    81  	Store   alertstore.LegacyConfig `yaml:"storage" doc:"description=Deprecated. Use -alertmanager-storage.* CLI flags and their respective YAML config options instead."`
    82  	Cluster ClusterConfig           `yaml:"cluster"`
    83  
    84  	EnableAPI bool `yaml:"enable_api"`
    85  
    86  	// For distributor.
    87  	AlertmanagerClient ClientConfig `yaml:"alertmanager_client"`
    88  
    89  	// For the state persister.
    90  	Persister PersisterConfig `yaml:",inline"`
    91  }
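
        // A minimal programmatic sketch of building this config (values below are illustrative,
        // not recommendations; in Cortex the struct is normally populated from YAML and CLI flags):
        //
        //	cfg := MultitenantAlertmanagerConfig{}
        //	flagext.DefaultValues(&cfg) // applies the defaults registered in RegisterFlags
        //	cfg.DataDir = "/data/alertmanager"
        //	cfg.ShardingEnabled = true
        //	_ = cfg.ExternalURL.Set("https://cortex.example.com/alertmanager")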
    92  
    93  type ClusterConfig struct {
    94  	ListenAddr       string                 `yaml:"listen_address"`
    95  	AdvertiseAddr    string                 `yaml:"advertise_address"`
    96  	Peers            flagext.StringSliceCSV `yaml:"peers"`
    97  	PeerTimeout      time.Duration          `yaml:"peer_timeout"`
    98  	GossipInterval   time.Duration          `yaml:"gossip_interval"`
    99  	PushPullInterval time.Duration          `yaml:"push_pull_interval"`
   100  }
   101  
   102  const (
   103  	defaultClusterAddr = "0.0.0.0:9094"
   104  	defaultPeerTimeout = 15 * time.Second
   105  )
   106  
   107  // RegisterFlags adds the flags required to configure this to the given FlagSet.
   108  func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
   109  	f.StringVar(&cfg.DataDir, "alertmanager.storage.path", "data/", "Base path for data storage.")
   110  	f.DurationVar(&cfg.Retention, "alertmanager.storage.retention", 5*24*time.Hour, "How long to keep data for.")
   111  	f.Int64Var(&cfg.MaxRecvMsgSize, "alertmanager.max-recv-msg-size", 16<<20, "Maximum size (bytes) of an accepted HTTP request body.")
   112  
   113  	f.Var(&cfg.ExternalURL, "alertmanager.web.external-url", "The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself. If the URL has a path portion, it will be used to prefix all HTTP endpoints served by Alertmanager. If omitted, relevant URL components will be derived automatically.")
   114  
   115  	f.StringVar(&cfg.FallbackConfigFile, "alertmanager.configs.fallback", "", "Filename of fallback config to use if none specified for instance.")
   116  	f.StringVar(&cfg.AutoWebhookRoot, "alertmanager.configs.auto-webhook-root", "", "Root of URL to generate if config is "+autoWebhookURL)
   117  	f.DurationVar(&cfg.PollInterval, "alertmanager.configs.poll-interval", 15*time.Second, "How frequently to poll Cortex configs")
   118  
   119  	f.BoolVar(&cfg.EnableAPI, "experimental.alertmanager.enable-api", false, "Enable the experimental alertmanager config api.")
   120  
   121  	f.BoolVar(&cfg.ShardingEnabled, "alertmanager.sharding-enabled", false, "Shard tenants across multiple alertmanager instances.")
   122  
   123  	cfg.AlertmanagerClient.RegisterFlagsWithPrefix("alertmanager.alertmanager-client", f)
   124  	cfg.Persister.RegisterFlagsWithPrefix("alertmanager", f)
   125  	cfg.ShardingRing.RegisterFlags(f)
   126  	cfg.Store.RegisterFlags(f)
   127  	cfg.Cluster.RegisterFlags(f)
   128  }
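
        // Example command-line usage of the flags registered above (paths and URLs are illustrative):
        //
        //	-alertmanager.storage.path=/data/alertmanager
        //	-alertmanager.web.external-url=https://cortex.example.com/alertmanager
        //	-alertmanager.configs.fallback=/etc/cortex/alertmanager-fallback.yaml
        //	-alertmanager.sharding-enabled=true
        //	-experimental.alertmanager.enable-api=true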
   129  
   130  func (cfg *ClusterConfig) RegisterFlags(f *flag.FlagSet) {
   131  	prefix := "alertmanager.cluster."
   132  	f.StringVar(&cfg.ListenAddr, prefix+"listen-address", defaultClusterAddr, "Listen address and port for the cluster. Not specifying this flag disables high-availability mode.")
   133  	f.StringVar(&cfg.AdvertiseAddr, prefix+"advertise-address", "", "Explicit address or hostname to advertise in cluster.")
   134  	f.Var(&cfg.Peers, prefix+"peers", "Comma-separated list of initial peers.")
   135  	f.DurationVar(&cfg.PeerTimeout, prefix+"peer-timeout", defaultPeerTimeout, "Time to wait between peers to send notifications.")
   136  	f.DurationVar(&cfg.GossipInterval, prefix+"gossip-interval", cluster.DefaultGossipInterval, "The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across cluster more quickly at the expense of increased bandwidth usage.")
   137  	f.DurationVar(&cfg.PushPullInterval, prefix+"push-pull-interval", cluster.DefaultPushPullInterval, "The interval between gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.")
   138  }
   139  
   140  // Validate validates the config and returns an error on failure.
   141  func (cfg *MultitenantAlertmanagerConfig) Validate(storageCfg alertstore.Config) error {
   142  	if cfg.ExternalURL.URL != nil && strings.HasSuffix(cfg.ExternalURL.Path, "/") {
   143  		return errInvalidExternalURL
   144  	}
   145  
   146  	if err := cfg.Store.Validate(); err != nil {
   147  		return errors.Wrap(err, "invalid storage config")
   148  	}
   149  
   150  	if err := cfg.Persister.Validate(); err != nil {
   151  		return err
   152  	}
   153  
   154  	if cfg.ShardingEnabled {
   155  		if !cfg.Store.IsDefaults() {
   156  			return errShardingLegacyStorage
   157  		}
   158  		if !storageCfg.IsFullStateSupported() {
   159  			return errShardingUnsupportedStorage
   160  		}
   161  		if cfg.ShardingRing.ZoneAwarenessEnabled && cfg.ShardingRing.InstanceZone == "" {
   162  			return errZoneAwarenessEnabledWithoutZoneInfo
   163  		}
   164  	}
   165  
   166  	return nil
   167  }
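
        // For example, -alertmanager.web.external-url=https://cortex.example.com/alertmanager passes
        // this validation, while https://cortex.example.com/alertmanager/ is rejected with
        // errInvalidExternalURL because its path ends with a trailing slash. (Hostnames here are
        // illustrative.)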
   168  
   169  type multitenantAlertmanagerMetrics struct {
   170  	lastReloadSuccessful          *prometheus.GaugeVec
   171  	lastReloadSuccessfulTimestamp *prometheus.GaugeVec
   172  }
   173  
   174  func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
   175  	m := &multitenantAlertmanagerMetrics{}
   176  
   177  	m.lastReloadSuccessful = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
   178  		Namespace: "cortex",
   179  		Name:      "alertmanager_config_last_reload_successful",
   180  		Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
   181  	}, []string{"user"})
   182  
   183  	m.lastReloadSuccessfulTimestamp = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
   184  		Namespace: "cortex",
   185  		Name:      "alertmanager_config_last_reload_successful_seconds",
   186  		Help:      "Timestamp of the last successful configuration reload.",
   187  	}, []string{"user"})
   188  
   189  	return m
   190  }
   191  
   192  // Limits defines limits used by Alertmanager.
   193  type Limits interface {
   194  	// AlertmanagerReceiversBlockCIDRNetworks returns the list of network CIDRs that should be blocked
   195  	// in the Alertmanager receivers for the given user.
   196  	AlertmanagerReceiversBlockCIDRNetworks(user string) []flagext.CIDR
   197  
   198  	// AlertmanagerReceiversBlockPrivateAddresses returns true if private addresses should be blocked
   199  	// in the Alertmanager receivers for the given user.
   200  	AlertmanagerReceiversBlockPrivateAddresses(user string) bool
   201  
   202  	// NotificationRateLimit returns the limit used by the rate-limiter for the given integration.
   203  	// If set to 0, no notifications are allowed.
   204  	// rate.Inf = all notifications are allowed.
   205  	//
   206  	// Note that negative or zero values specified by the user are translated to rate.Limit by Overrides,
   207  	// and may have a different meaning there.
   208  	NotificationRateLimit(tenant string, integration string) rate.Limit
   209  
   210  	// NotificationBurstSize returns burst-size for rate limiter for given integration type. If 0, no notifications are allowed except
   211  	// when limit == rate.Inf.
   212  	NotificationBurstSize(tenant string, integration string) int
   213  
   214  	// AlertmanagerMaxConfigSize returns max size of configuration file that user is allowed to upload. If 0, there is no limit.
   215  	AlertmanagerMaxConfigSize(tenant string) int
   216  
   217  	// AlertmanagerMaxTemplatesCount returns max number of templates that tenant can use in the configuration. 0 = no limit.
   218  	AlertmanagerMaxTemplatesCount(tenant string) int
   219  
   220  	// AlertmanagerMaxTemplateSize returns max size of individual template. 0 = no limit.
   221  	AlertmanagerMaxTemplateSize(tenant string) int
   222  
   223  	// AlertmanagerMaxDispatcherAggregationGroups returns maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have.
   224  	// Each aggregation group consumes a single goroutine. 0 = unlimited.
   225  	AlertmanagerMaxDispatcherAggregationGroups(t string) int
   226  
   227  	// AlertmanagerMaxAlertsCount returns max number of alerts that tenant can have active at the same time. 0 = no limit.
   228  	AlertmanagerMaxAlertsCount(tenant string) int
   229  
   230  	// AlertmanagerMaxAlertsSizeBytes returns total max size of alerts that tenant can have active at the same time. 0 = no limit.
   231  	// Size of the alert is computed from alert labels, annotations and generator URL.
   232  	AlertmanagerMaxAlertsSizeBytes(tenant string) int
   233  }
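
        // A minimal sketch of a static Limits implementation, e.g. for tests (hypothetical type,
        // not part of this package; the remaining methods would be written the same way):
        //
        //	type staticLimits struct{}
        //
        //	func (staticLimits) NotificationRateLimit(tenant, integration string) rate.Limit {
        //		return rate.Inf // allow all notifications
        //	}
        //
        //	func (staticLimits) NotificationBurstSize(tenant, integration string) int { return 0 }
        //	func (staticLimits) AlertmanagerMaxConfigSize(tenant string) int          { return 0 } // 0 = no limit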
   234  
   235  // A MultitenantAlertmanager manages Alertmanager instances for multiple
   236  // organizations.
   237  type MultitenantAlertmanager struct {
   238  	services.Service
   239  
   240  	cfg *MultitenantAlertmanagerConfig
   241  
   242  	// Ring used for sharding alertmanager instances.
   243  	// When sharding is disabled, the flow is:
   244  	//   ServeHTTP() -> serveRequest()
   245  	// When sharding is enabled:
   246  	//   ServeHTTP() -> distributor.DistributeRequest() -> (sends to other AM or even the current)
   247  	//     -> HandleRequest() (gRPC call) -> grpcServer() -> handlerForGRPCServer.ServeHTTP() -> serveRequest().
   248  	ringLifecycler *ring.BasicLifecycler
   249  	ring           *ring.Ring
   250  	distributor    *Distributor
   251  	grpcServer     *server.Server
   252  
   253  	// Last ring state. This variable is not protected with a mutex because it's always
   254  	// accessed by a single goroutine at a time.
   255  	ringLastState ring.ReplicationSet
   256  
   257  	// Subservices manager (ring, lifecycler)
   258  	subservices        *services.Manager
   259  	subservicesWatcher *services.FailureWatcher
   260  
   261  	store alertstore.AlertStore
   262  
   263  	// The fallback config is stored as a string and parsed every time it's needed
   264  	// because we mutate the parsed results and don't want those changes to take
   265  	// effect here.
   266  	fallbackConfig string
   267  
   268  	alertmanagersMtx sync.Mutex
   269  	alertmanagers    map[string]*Alertmanager
   270  	// Stores the current set of configurations we're running in each tenant's Alertmanager.
   271  	// Used for comparing configurations as we synchronize them.
   272  	cfgs map[string]alertspb.AlertConfigDesc
   273  
   274  	logger              log.Logger
   275  	alertmanagerMetrics *alertmanagerMetrics
   276  	multitenantMetrics  *multitenantAlertmanagerMetrics
   277  
   278  	peer                    *cluster.Peer
   279  	alertmanagerClientsPool ClientsPool
   280  
   281  	limits Limits
   282  
   283  	registry          prometheus.Registerer
   284  	ringCheckErrors   prometheus.Counter
   285  	tenantsOwned      prometheus.Gauge
   286  	tenantsDiscovered prometheus.Gauge
   287  	syncTotal         *prometheus.CounterVec
   288  	syncFailures      *prometheus.CounterVec
   289  }
   290  
   291  // NewMultitenantAlertmanager creates a new MultitenantAlertmanager.
   292  func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, store alertstore.AlertStore, limits Limits, logger log.Logger, registerer prometheus.Registerer) (*MultitenantAlertmanager, error) {
   293  	err := os.MkdirAll(cfg.DataDir, 0777)
   294  	if err != nil {
   295  		return nil, fmt.Errorf("unable to create Alertmanager data directory %q: %s", cfg.DataDir, err)
   296  	}
   297  
   298  	if cfg.ExternalURL.URL == nil {
   299  		return nil, fmt.Errorf("unable to create Alertmanager because the external URL has not been configured")
   300  	}
   301  
   302  	var fallbackConfig []byte
   303  	if cfg.FallbackConfigFile != "" {
   304  		fallbackConfig, err = ioutil.ReadFile(cfg.FallbackConfigFile)
   305  		if err != nil {
   306  			return nil, fmt.Errorf("unable to read fallback config %q: %s", cfg.FallbackConfigFile, err)
   307  		}
   308  		_, err = amconfig.LoadFile(cfg.FallbackConfigFile)
   309  		if err != nil {
   310  			return nil, fmt.Errorf("unable to load fallback config %q: %s", cfg.FallbackConfigFile, err)
   311  		}
   312  	}
   313  
   314  	var peer *cluster.Peer
   315  	// We need to take this case into account to support our legacy upstream clustering.
   316  	if cfg.Cluster.ListenAddr != "" && !cfg.ShardingEnabled {
   317  		peer, err = cluster.Create(
   318  			log.With(logger, "component", "cluster"),
   319  			registerer,
   320  			cfg.Cluster.ListenAddr,
   321  			cfg.Cluster.AdvertiseAddr,
   322  			cfg.Cluster.Peers,
   323  			true,
   324  			cfg.Cluster.PushPullInterval,
   325  			cfg.Cluster.GossipInterval,
   326  			cluster.DefaultTcpTimeout,
   327  			cluster.DefaultProbeTimeout,
   328  			cluster.DefaultProbeInterval,
   329  			nil,
   330  		)
   331  		if err != nil {
   332  			return nil, errors.Wrap(err, "unable to initialize gossip mesh")
   333  		}
   334  		err = peer.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout)
   335  		if err != nil {
   336  			level.Warn(logger).Log("msg", "unable to join gossip mesh while initializing cluster for high availability mode", "err", err)
   337  		}
   338  		go peer.Settle(context.Background(), cluster.DefaultGossipInterval)
   339  	}
   340  
   341  	var ringStore kv.Client
   342  	if cfg.ShardingEnabled {
   343  		util_log.WarnExperimentalUse("Alertmanager sharding")
   344  
   345  		ringStore, err = kv.NewClient(
   346  			cfg.ShardingRing.KVStore,
   347  			ring.GetCodec(),
   348  			kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("cortex_", registerer), "alertmanager"),
   349  			logger,
   350  		)
   351  		if err != nil {
   352  			return nil, errors.Wrap(err, "create KV store client")
   353  		}
   354  	}
   355  
   356  	return createMultitenantAlertmanager(cfg, fallbackConfig, peer, store, ringStore, limits, logger, registerer)
   357  }
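
        // A rough usage sketch (the store, limits and registry come from the wider Cortex setup;
        // error handling is elided and the HTTP route below is a simplification of how Cortex's
        // API module actually mounts the handler):
        //
        //	multiAM, err := NewMultitenantAlertmanager(&cfg, store, limits, logger, prometheus.DefaultRegisterer)
        //	// ... handle err ...
        //	if err := services.StartAndAwaitRunning(ctx, multiAM); err != nil {
        //		// ... handle err ...
        //	}
        //	defer services.StopAndAwaitTerminated(context.Background(), multiAM) //nolint:errcheck
        //	http.Handle("/alertmanager/", multiAM) // MultitenantAlertmanager implements http.Handler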
   358  
   359  func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackConfig []byte, peer *cluster.Peer, store alertstore.AlertStore, ringStore kv.Client, limits Limits, logger log.Logger, registerer prometheus.Registerer) (*MultitenantAlertmanager, error) {
   360  	am := &MultitenantAlertmanager{
   361  		cfg:                 cfg,
   362  		fallbackConfig:      string(fallbackConfig),
   363  		cfgs:                map[string]alertspb.AlertConfigDesc{},
   364  		alertmanagers:       map[string]*Alertmanager{},
   365  		alertmanagerMetrics: newAlertmanagerMetrics(),
   366  		multitenantMetrics:  newMultitenantAlertmanagerMetrics(registerer),
   367  		peer:                peer,
   368  		store:               store,
   369  		logger:              log.With(logger, "component", "MultiTenantAlertmanager"),
   370  		registry:            registerer,
   371  		limits:              limits,
   372  		ringCheckErrors: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
   373  			Name: "cortex_alertmanager_ring_check_errors_total",
   374  			Help: "Number of errors that have occurred when checking the ring for ownership.",
   375  		}),
   376  		syncTotal: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
   377  			Name: "cortex_alertmanager_sync_configs_total",
   378  			Help: "Total number of times the alertmanager sync operation triggered.",
   379  		}, []string{"reason"}),
   380  		syncFailures: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
   381  			Name: "cortex_alertmanager_sync_configs_failed_total",
   382  			Help: "Total number of times the alertmanager sync operation failed.",
   383  		}, []string{"reason"}),
   384  		tenantsDiscovered: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
   385  			Name: "cortex_alertmanager_tenants_discovered",
   386  			Help: "Number of tenants with an Alertmanager configuration discovered.",
   387  		}),
   388  		tenantsOwned: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
   389  			Name: "cortex_alertmanager_tenants_owned",
   390  			Help: "Current number of tenants owned by the Alertmanager instance.",
   391  		}),
   392  	}
   393  
   394  	// Initialize the top-level metrics.
   395  	for _, r := range []string{reasonInitial, reasonPeriodic, reasonRingChange} {
   396  		am.syncTotal.WithLabelValues(r)
   397  		am.syncFailures.WithLabelValues(r)
   398  	}
   399  
   400  	if cfg.ShardingEnabled {
   401  		lifecyclerCfg, err := am.cfg.ShardingRing.ToLifecyclerConfig(am.logger)
   402  		if err != nil {
   403  			return nil, errors.Wrap(err, "failed to initialize Alertmanager's lifecycler config")
   404  		}
   405  
   406  		// Define lifecycler delegates in reverse order (last to be called defined first because they're
   407  		// chained via "next delegate").
   408  		delegate := ring.BasicLifecyclerDelegate(am)
   409  		delegate = ring.NewLeaveOnStoppingDelegate(delegate, am.logger)
   410  		delegate = ring.NewAutoForgetDelegate(am.cfg.ShardingRing.HeartbeatTimeout*ringAutoForgetUnhealthyPeriods, delegate, am.logger)
   411  
   412  		am.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, RingNameForServer, RingKey, ringStore, delegate, am.logger, prometheus.WrapRegistererWithPrefix("cortex_", am.registry))
   413  		if err != nil {
   414  			return nil, errors.Wrap(err, "failed to initialize Alertmanager's lifecycler")
   415  		}
   416  
   417  		am.ring, err = ring.NewWithStoreClientAndStrategy(am.cfg.ShardingRing.ToRingConfig(), RingNameForServer, RingKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", am.registry), am.logger)
   418  		if err != nil {
   419  			return nil, errors.Wrap(err, "failed to initialize Alertmanager's ring")
   420  		}
   421  
   422  		am.grpcServer = server.NewServer(&handlerForGRPCServer{am: am})
   423  
   424  		am.alertmanagerClientsPool = newAlertmanagerClientsPool(client.NewRingServiceDiscovery(am.ring), cfg.AlertmanagerClient, logger, am.registry)
   425  		am.distributor, err = NewDistributor(cfg.AlertmanagerClient, cfg.MaxRecvMsgSize, am.ring, am.alertmanagerClientsPool, log.With(logger, "component", "AlertmanagerDistributor"), am.registry)
   426  		if err != nil {
   427  			return nil, errors.Wrap(err, "create distributor")
   428  		}
   429  	}
   430  
   431  	if registerer != nil {
   432  		registerer.MustRegister(am.alertmanagerMetrics)
   433  	}
   434  
   435  	am.Service = services.NewBasicService(am.starting, am.run, am.stopping)
   436  
   437  	return am, nil
   438  }
   439  
   440  // handlerForGRPCServer acts as a handler for the gRPC server, serving
   441  // serveRequest() via the standard ServeHTTP interface.
   442  type handlerForGRPCServer struct {
   443  	am *MultitenantAlertmanager
   444  }
   445  
   446  func (h *handlerForGRPCServer) ServeHTTP(w http.ResponseWriter, req *http.Request) {
   447  	h.am.serveRequest(w, req)
   448  }
   449  
   450  func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {
   451  	err = am.migrateStateFilesToPerTenantDirectories()
   452  	if err != nil {
   453  		return err
   454  	}
   455  
   456  	defer func() {
   457  		if err == nil || am.subservices == nil {
   458  			return
   459  		}
   460  
   461  		if stopErr := services.StopManagerAndAwaitStopped(context.Background(), am.subservices); stopErr != nil {
   462  			level.Error(am.logger).Log("msg", "failed to gracefully stop alertmanager dependencies", "err", stopErr)
   463  		}
   464  	}()
   465  
   466  	if am.cfg.ShardingEnabled {
   467  		if am.subservices, err = services.NewManager(am.ringLifecycler, am.ring, am.distributor); err != nil {
   468  			return errors.Wrap(err, "failed to start alertmanager's subservices")
   469  		}
   470  
   471  		if err = services.StartManagerAndAwaitHealthy(ctx, am.subservices); err != nil {
   472  			return errors.Wrap(err, "failed to start alertmanager's subservices")
   473  		}
   474  
   475  		am.subservicesWatcher = services.NewFailureWatcher()
   476  		am.subservicesWatcher.WatchManager(am.subservices)
   477  
   478  		// We wait until the instance is in the JOINING state: once it is, we know that tokens are assigned to this instance and we're ready to perform an initial sync of configs.
   479  		level.Info(am.logger).Log("msg", "waiting until alertmanager is JOINING in the ring")
   480  		if err = ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
   481  			return err
   482  		}
   483  		level.Info(am.logger).Log("msg", "alertmanager is JOINING in the ring")
   484  	}
   485  
   486  	// At this point, if sharding is enabled, the instance is registered with some tokens
   487  	// and we can run the initial sync of configs. If sharding is not enabled, we load _all_ the configs.
   488  	if err := am.loadAndSyncConfigs(ctx, reasonInitial); err != nil {
   489  		return err
   490  	}
   491  
   492  	if am.cfg.ShardingEnabled {
   493  		// Store the ring state after the initial Alertmanager configs sync has been done and before we change
   494  		// our state in the ring.
   495  		am.ringLastState, _ = am.ring.GetAllHealthy(RingOp)
   496  
   497  		// Make sure that all the alertmanagers we were initially configured with have
   498  		// fetched state from the replicas, before advertising as ACTIVE. This will
   499  		// reduce the possibility that we lose state when new instances join/leave.
   500  		level.Info(am.logger).Log("msg", "waiting until initial state sync is complete for all users")
   501  		if err := am.waitInitialStateSync(ctx); err != nil {
   502  			return errors.Wrap(err, "failed to wait for initial state sync")
   503  		}
   504  		level.Info(am.logger).Log("msg", "initial state sync is complete")
   505  
   506  		// With the initial sync now completed, we should have loaded all assigned alertmanager configurations to this instance. We can switch it to ACTIVE and start serving requests.
   507  		if err := am.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil {
   508  			return errors.Wrapf(err, "switch instance to %s in the ring", ring.ACTIVE)
   509  		}
   510  
   511  		// Wait until the ring client detected this instance in the ACTIVE state.
   512  		level.Info(am.logger).Log("msg", "waiting until alertmanager is ACTIVE in the ring")
   513  		if err := ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
   514  			return err
   515  		}
   516  		level.Info(am.logger).Log("msg", "alertmanager is ACTIVE in the ring")
   517  	}
   518  
   519  	return nil
   520  }
   521  
   522  // migrateStateFilesToPerTenantDirectories migrates any existing state files from the old flat layout to the new per-tenant directory hierarchy.
   523  // TODO: Remove in Cortex 1.11.
   524  func (am *MultitenantAlertmanager) migrateStateFilesToPerTenantDirectories() error {
   525  	migrate := func(from, to string) error {
   526  		level.Info(am.logger).Log("msg", "migrating alertmanager state", "from", from, "to", to)
   527  		err := os.Rename(from, to)
   528  		return errors.Wrapf(err, "failed to migrate alertmanager state from %v to %v", from, to)
   529  	}
   530  
   531  	st, err := am.getObsoleteFilesPerUser()
   532  	if err != nil {
   533  		return errors.Wrap(err, "failed to migrate alertmanager state files")
   534  	}
   535  
   536  	for userID, files := range st {
   537  		tenantDir := am.getTenantDirectory(userID)
   538  		err := os.MkdirAll(tenantDir, 0777)
   539  		if err != nil {
   540  			return errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir)
   541  		}
   542  
   543  		errs := tsdb_errors.NewMulti()
   544  
   545  		if files.notificationLogSnapshot != "" {
   546  			errs.Add(migrate(files.notificationLogSnapshot, filepath.Join(tenantDir, notificationLogSnapshot)))
   547  		}
   548  
   549  		if files.silencesSnapshot != "" {
   550  			errs.Add(migrate(files.silencesSnapshot, filepath.Join(tenantDir, silencesSnapshot)))
   551  		}
   552  
   553  		if files.templatesDir != "" {
   554  			errs.Add(migrate(files.templatesDir, filepath.Join(tenantDir, templatesDir)))
   555  		}
   556  
   557  		if err := errs.Err(); err != nil {
   558  			return err
   559  		}
   560  	}
   561  	return nil
   562  }
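
        // Illustrative migration for a hypothetical tenant "team-a" under DataDir (the exact new
        // filenames come from the notificationLogSnapshot, silencesSnapshot and templatesDir
        // constants defined elsewhere in this package):
        //
        //	nflog:team-a      -> team-a/<notificationLogSnapshot>
        //	silences:team-a   -> team-a/<silencesSnapshot>
        //	templates/team-a/ -> team-a/<templatesDir>/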
   563  
   564  type obsoleteStateFiles struct {
   565  	notificationLogSnapshot string
   566  	silencesSnapshot        string
   567  	templatesDir            string
   568  }
   569  
   570  // getObsoleteFilesPerUser returns the per-user set of files that should be migrated from the old structure to the new one.
   571  func (am *MultitenantAlertmanager) getObsoleteFilesPerUser() (map[string]obsoleteStateFiles, error) {
   572  	files, err := ioutil.ReadDir(am.cfg.DataDir)
   573  	if err != nil {
   574  		return nil, errors.Wrapf(err, "failed to list dir %v", am.cfg.DataDir)
   575  	}
   576  
   577  	// old names
   578  	const (
   579  		notificationLogPrefix = "nflog:"
   580  		silencesPrefix        = "silences:"
   581  		templates             = "templates"
   582  	)
   583  
   584  	result := map[string]obsoleteStateFiles{}
   585  
   586  	for _, f := range files {
   587  		fullPath := filepath.Join(am.cfg.DataDir, f.Name())
   588  
   589  		if f.IsDir() {
   590  			// Process templates dir.
   591  			if f.Name() != templates {
   592  				// Ignore other directories -- those are likely per-tenant directories.
   593  				continue
   594  			}
   595  
   596  			templateDirs, err := ioutil.ReadDir(fullPath)
   597  			if err != nil {
   598  				return nil, errors.Wrapf(err, "failed to list dir %v", fullPath)
   599  			}
   600  
   601  			// Previously, the templates directory contained per-tenant subdirectories.
   602  			for _, d := range templateDirs {
   603  				if d.IsDir() {
   604  					v := result[d.Name()]
   605  					v.templatesDir = filepath.Join(fullPath, d.Name())
   606  					result[d.Name()] = v
   607  				} else {
   608  					level.Warn(am.logger).Log("msg", "ignoring unknown local file while migrating local alertmanager state files", "file", filepath.Join(fullPath, d.Name()))
   609  				}
   610  			}
   611  			continue
   612  		}
   613  
   614  		switch {
   615  		case strings.HasPrefix(f.Name(), notificationLogPrefix):
   616  			userID := strings.TrimPrefix(f.Name(), notificationLogPrefix)
   617  			v := result[userID]
   618  			v.notificationLogSnapshot = fullPath
   619  			result[userID] = v
   620  
   621  		case strings.HasPrefix(f.Name(), silencesPrefix):
   622  			userID := strings.TrimPrefix(f.Name(), silencesPrefix)
   623  			v := result[userID]
   624  			v.silencesSnapshot = fullPath
   625  			result[userID] = v
   626  
   627  		default:
   628  			level.Warn(am.logger).Log("msg", "ignoring unknown local data file while migrating local alertmanager state files", "file", fullPath)
   629  		}
   630  	}
   631  
   632  	return result, nil
   633  }
   634  
   635  func (am *MultitenantAlertmanager) run(ctx context.Context) error {
   636  	tick := time.NewTicker(am.cfg.PollInterval)
   637  	defer tick.Stop()
   638  
   639  	var ringTickerChan <-chan time.Time
   640  
   641  	if am.cfg.ShardingEnabled {
   642  		ringTicker := time.NewTicker(util.DurationWithJitter(am.cfg.ShardingRing.RingCheckPeriod, 0.2))
   643  		defer ringTicker.Stop()
   644  		ringTickerChan = ringTicker.C
   645  	}
   646  
   647  	for {
   648  		select {
   649  		case <-ctx.Done():
   650  			return nil
   651  		case err := <-am.subservicesWatcher.Chan():
   652  			return errors.Wrap(err, "alertmanager subservices failed")
   653  		case <-tick.C:
   654  			// We don't want to halt execution here but instead just log what happened.
   655  			if err := am.loadAndSyncConfigs(ctx, reasonPeriodic); err != nil {
   656  				level.Warn(am.logger).Log("msg", "error while synchronizing alertmanager configs", "err", err)
   657  			}
   658  		case <-ringTickerChan:
   659  			// We ignore the error because in case of error it will return an empty
   660  			// replication set which we use to compare with the previous state.
   661  			currRingState, _ := am.ring.GetAllHealthy(RingOp)
   662  
   663  			if ring.HasReplicationSetChanged(am.ringLastState, currRingState) {
   664  				am.ringLastState = currRingState
   665  				if err := am.loadAndSyncConfigs(ctx, reasonRingChange); err != nil {
   666  					level.Warn(am.logger).Log("msg", "error while synchronizing alertmanager configs", "err", err)
   667  				}
   668  			}
   669  		}
   670  	}
   671  }
   672  
   673  func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncReason string) error {
   674  	level.Info(am.logger).Log("msg", "synchronizing alertmanager configs for users")
   675  	am.syncTotal.WithLabelValues(syncReason).Inc()
   676  
   677  	allUsers, cfgs, err := am.loadAlertmanagerConfigs(ctx)
   678  	if err != nil {
   679  		am.syncFailures.WithLabelValues(syncReason).Inc()
   680  		return err
   681  	}
   682  
   683  	am.syncConfigs(cfgs)
   684  	am.deleteUnusedLocalUserState()
   685  
   686  	// Currently, remote state persistence is only used when sharding is enabled.
   687  	if am.cfg.ShardingEnabled {
   688  		// Note when cleaning up remote state, remember that the user may not necessarily be configured
   689  		// in this instance. Therefore, pass the list of _all_ configured users to filter by.
   690  		am.deleteUnusedRemoteUserState(ctx, allUsers)
   691  	}
   692  
   693  	return nil
   694  }
   695  
   696  func (am *MultitenantAlertmanager) waitInitialStateSync(ctx context.Context) error {
   697  	am.alertmanagersMtx.Lock()
   698  	ams := make([]*Alertmanager, 0, len(am.alertmanagers))
   699  	for _, userAM := range am.alertmanagers {
   700  		ams = append(ams, userAM)
   701  	}
   702  	am.alertmanagersMtx.Unlock()
   703  
   704  	for _, userAM := range ams {
   705  		if err := userAM.WaitInitialStateSync(ctx); err != nil {
   706  			return err
   707  		}
   708  	}
   709  
   710  	return nil
   711  }
   712  
   713  // stopping runs when MultitenantAlertmanager transitions to Stopping state.
   714  func (am *MultitenantAlertmanager) stopping(_ error) error {
   715  	am.alertmanagersMtx.Lock()
   716  	for _, am := range am.alertmanagers {
   717  		am.StopAndWait()
   718  	}
   719  	am.alertmanagersMtx.Unlock()
   720  	if am.peer != nil { // Tests don't set up any peer.
   721  		err := am.peer.Leave(am.cfg.Cluster.PeerTimeout)
   722  		if err != nil {
   723  			level.Warn(am.logger).Log("msg", "failed to leave the cluster", "err", err)
   724  		}
   725  	}
   726  
   727  	if am.subservices != nil {
   728  		// subservices manages ring and lifecycler, if sharding was enabled.
   729  		_ = services.StopManagerAndAwaitStopped(context.Background(), am.subservices)
   730  	}
   731  	return nil
   732  }
   733  
   734  // loadAlertmanagerConfigs loads (and filters) the alertmanager configurations from object storage, taking the sharding strategy into consideration. It returns:
   735  // - The list of discovered users (all users with a configuration in storage)
   736  // - The configurations of users owned by this instance.
   737  func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context) ([]string, map[string]alertspb.AlertConfigDesc, error) {
   738  	// Find all users with an alertmanager config.
   739  	allUserIDs, err := am.store.ListAllUsers(ctx)
   740  	if err != nil {
   741  		return nil, nil, errors.Wrap(err, "failed to list users with alertmanager configuration")
   742  	}
   743  	numUsersDiscovered := len(allUserIDs)
   744  	ownedUserIDs := make([]string, 0, len(allUserIDs))
   745  
   746  	// Filter out users not owned by this shard.
   747  	for _, userID := range allUserIDs {
   748  		if am.isUserOwned(userID) {
   749  			ownedUserIDs = append(ownedUserIDs, userID)
   750  		}
   751  	}
   752  	numUsersOwned := len(ownedUserIDs)
   753  
   754  	// Load the configs for the owned users.
   755  	configs, err := am.store.GetAlertConfigs(ctx, ownedUserIDs)
   756  	if err != nil {
   757  		return nil, nil, errors.Wrapf(err, "failed to load alertmanager configurations for owned users")
   758  	}
   759  
   760  	am.tenantsDiscovered.Set(float64(numUsersDiscovered))
   761  	am.tenantsOwned.Set(float64(numUsersOwned))
   762  	return allUserIDs, configs, nil
   763  }
   764  
   765  func (am *MultitenantAlertmanager) isUserOwned(userID string) bool {
   766  	// If sharding is disabled, any alertmanager instance owns all users.
   767  	if !am.cfg.ShardingEnabled {
   768  		return true
   769  	}
   770  
   771  	alertmanagers, err := am.ring.Get(shardByUser(userID), SyncRingOp, nil, nil, nil)
   772  	if err != nil {
   773  		am.ringCheckErrors.Inc()
   774  		level.Error(am.logger).Log("msg", "failed to load alertmanager configuration", "user", userID, "err", err)
   775  		return false
   776  	}
   777  
   778  	return alertmanagers.Includes(am.ringLifecycler.GetInstanceAddr())
   779  }
   780  
   781  func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alertspb.AlertConfigDesc) {
   782  	level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgs))
   783  	for user, cfg := range cfgs {
   784  		err := am.setConfig(cfg)
   785  		if err != nil {
   786  			am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0))
   787  			level.Warn(am.logger).Log("msg", "error applying config", "err", err)
   788  			continue
   789  		}
   790  
   791  		am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1))
   792  		am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
   793  	}
   794  
   795  	userAlertmanagersToStop := map[string]*Alertmanager{}
   796  
   797  	am.alertmanagersMtx.Lock()
   798  	for userID, userAM := range am.alertmanagers {
   799  		if _, exists := cfgs[userID]; !exists {
   800  			userAlertmanagersToStop[userID] = userAM
   801  			delete(am.alertmanagers, userID)
   802  			delete(am.cfgs, userID)
   803  			am.multitenantMetrics.lastReloadSuccessful.DeleteLabelValues(userID)
   804  			am.multitenantMetrics.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
   805  			am.alertmanagerMetrics.removeUserRegistry(userID)
   806  		}
   807  	}
   808  	am.alertmanagersMtx.Unlock()
   809  
   810  	// Now stop alertmanagers and wait until they are really stopped, without holding lock.
   811  	for userID, userAM := range userAlertmanagersToStop {
   812  		level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", userID)
   813  		userAM.StopAndWait()
   814  		level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", userID)
   815  	}
   816  }
   817  
   818  // setConfig applies the given configuration to the alertmanager for `userID`,
   819  // creating an alertmanager if it doesn't already exist.
   820  func (am *MultitenantAlertmanager) setConfig(cfg alertspb.AlertConfigDesc) error {
   821  	var userAmConfig *amconfig.Config
   822  	var err error
   823  	var hasTemplateChanges bool
   824  	var userTemplateDir = filepath.Join(am.getTenantDirectory(cfg.User), templatesDir)
   825  	var pathsToRemove = make(map[string]struct{})
   826  
   827  	// List existing files to keep track of the ones to be removed.
   828  	if oldTemplateFiles, err := ioutil.ReadDir(userTemplateDir); err == nil {
   829  		for _, file := range oldTemplateFiles {
   830  			pathsToRemove[filepath.Join(userTemplateDir, file.Name())] = struct{}{}
   831  		}
   832  	}
   833  
   834  	for _, tmpl := range cfg.Templates {
   835  		templateFilePath, err := safeTemplateFilepath(userTemplateDir, tmpl.Filename)
   836  		if err != nil {
   837  			return err
   838  		}
   839  
   840  		// Remove from the pathsToRemove map the files that still exist in the config.
   841  		delete(pathsToRemove, templateFilePath)
   842  		hasChanged, err := storeTemplateFile(templateFilePath, tmpl.Body)
   843  		if err != nil {
   844  			return err
   845  		}
   846  
   847  		if hasChanged {
   848  			hasTemplateChanges = true
   849  		}
   850  	}
   851  
   852  	for pathToRemove := range pathsToRemove {
   853  		err := os.Remove(pathToRemove)
   854  		if err != nil {
   855  			level.Warn(am.logger).Log("msg", "failed to remove file", "file", pathToRemove, "err", err)
   856  		}
   857  		hasTemplateChanges = true
   858  	}
   859  
   860  	level.Debug(am.logger).Log("msg", "setting config", "user", cfg.User)
   861  
   862  	am.alertmanagersMtx.Lock()
   863  	defer am.alertmanagersMtx.Unlock()
   864  	existing, hasExisting := am.alertmanagers[cfg.User]
   865  
   866  	rawCfg := cfg.RawConfig
   867  	if cfg.RawConfig == "" {
   868  		if am.fallbackConfig == "" {
   869  			return fmt.Errorf("blank Alertmanager configuration for %v", cfg.User)
   870  		}
   871  		level.Debug(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user", cfg.User)
   872  		userAmConfig, err = amconfig.Load(am.fallbackConfig)
   873  		if err != nil {
   874  			return fmt.Errorf("unable to load fallback configuration for %v: %v", cfg.User, err)
   875  		}
   876  		rawCfg = am.fallbackConfig
   877  	} else {
   878  		userAmConfig, err = amconfig.Load(cfg.RawConfig)
   879  		if err != nil && hasExisting {
   880  			// This means that if a user has a working config and
   881  			// they submit a broken one, the Manager will keep running the last known
   882  			// working configuration.
   883  			return fmt.Errorf("invalid Cortex configuration for %v: %v", cfg.User, err)
   884  		}
   885  	}
   886  
   887  	// We can have an empty configuration here if:
   888  	// 1) the user had a previous alertmanager
   889  	// 2) then, submitted a non-working configuration (and we kept running the prev working config)
   890  	// 3) finally, the cortex AM instance is restarted and the running version is no longer present
   891  	if userAmConfig == nil {
   892  		return fmt.Errorf("no usable Alertmanager configuration for %v", cfg.User)
   893  	}
   894  
   895  	// Transform webhook config URLs to the per-tenant monitor.
   896  	if am.cfg.AutoWebhookRoot != "" {
   897  		for i, r := range userAmConfig.Receivers {
   898  			for j, w := range r.WebhookConfigs {
   899  				if w.URL.String() == autoWebhookURL {
   900  					u, err := url.Parse(am.cfg.AutoWebhookRoot + "/" + cfg.User + "/monitor")
   901  					if err != nil {
   902  						return err
   903  					}
   904  
   905  					userAmConfig.Receivers[i].WebhookConfigs[j].URL = &amconfig.URL{URL: u}
   906  				}
   907  			}
   908  		}
   909  	}
   910  
   911  	// If no Alertmanager instance exists for this user yet, start one.
   912  	if !hasExisting {
   913  		level.Debug(am.logger).Log("msg", "initializing new per-tenant alertmanager", "user", cfg.User)
   914  		newAM, err := am.newAlertmanager(cfg.User, userAmConfig, rawCfg)
   915  		if err != nil {
   916  			return err
   917  		}
   918  		am.alertmanagers[cfg.User] = newAM
   919  	} else if am.cfgs[cfg.User].RawConfig != cfg.RawConfig || hasTemplateChanges {
   920  		level.Info(am.logger).Log("msg", "updating per-tenant alertmanager", "user", cfg.User)
   921  		// If the config changed, apply the new one.
   922  		err := existing.ApplyConfig(cfg.User, userAmConfig, rawCfg)
   923  		if err != nil {
   924  			return fmt.Errorf("unable to apply Alertmanager config for user %v: %v", cfg.User, err)
   925  		}
   926  	}
   927  
   928  	am.cfgs[cfg.User] = cfg
   929  	return nil
   930  }
   931  
   932  func (am *MultitenantAlertmanager) getTenantDirectory(userID string) string {
   933  	return filepath.Join(am.cfg.DataDir, userID)
   934  }
   935  
   936  func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config, rawCfg string) (*Alertmanager, error) {
   937  	reg := prometheus.NewRegistry()
   938  
   939  	tenantDir := am.getTenantDirectory(userID)
   940  	err := os.MkdirAll(tenantDir, 0777)
   941  	if err != nil {
   942  		return nil, errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir)
   943  	}
   944  
   945  	newAM, err := New(&Config{
   946  		UserID:            userID,
   947  		TenantDataDir:     tenantDir,
   948  		Logger:            am.logger,
   949  		Peer:              am.peer,
   950  		PeerTimeout:       am.cfg.Cluster.PeerTimeout,
   951  		Retention:         am.cfg.Retention,
   952  		ExternalURL:       am.cfg.ExternalURL.URL,
   953  		ShardingEnabled:   am.cfg.ShardingEnabled,
   954  		Replicator:        am,
   955  		ReplicationFactor: am.cfg.ShardingRing.ReplicationFactor,
   956  		Store:             am.store,
   957  		PersisterConfig:   am.cfg.Persister,
   958  		Limits:            am.limits,
   959  	}, reg)
   960  	if err != nil {
   961  		return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err)
   962  	}
   963  
   964  	if err := newAM.ApplyConfig(userID, amConfig, rawCfg); err != nil {
   965  		return nil, fmt.Errorf("unable to apply initial config for user %v: %v", userID, err)
   966  	}
   967  
   968  	am.alertmanagerMetrics.addUserRegistry(userID, reg)
   969  	return newAM, nil
   970  }
   971  
   972  // GetPositionForUser returns the position this Alertmanager instance holds in the ring relative to its other replicas for a specific user.
   973  func (am *MultitenantAlertmanager) GetPositionForUser(userID string) int {
   974  	// If we have a replication factor of 1 or less we don't need to do any work and can immediately return.
   975  	if am.ring == nil || am.ring.ReplicationFactor() <= 1 {
   976  		return 0
   977  	}
   978  
   979  	set, err := am.ring.Get(shardByUser(userID), RingOp, nil, nil, nil)
   980  	if err != nil {
   981  		level.Error(am.logger).Log("msg", "unable to read the ring while trying to determine the alertmanager position", "err", err)
   982  		// If we're unable to determine the position, we don't want a tenant to miss out on the notification - instead,
   983  		// just assume we're the first in line and run the risk of a double notification.
   984  		return 0
   985  	}
   986  
   987  	var position int
   988  	for i, instance := range set.Instances {
   989  		if instance.Addr == am.ringLifecycler.GetInstanceAddr() {
   990  			position = i
   991  			break
   992  		}
   993  	}
   994  
   995  	return position
   996  }
   997  
   998  // ServeHTTP serves the Alertmanager's web UI and API.
   999  func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Request) {
  1000  	if am.State() != services.Running {
  1001  		http.Error(w, "Alertmanager not ready", http.StatusServiceUnavailable)
  1002  		return
  1003  	}
  1004  
  1005  	if am.cfg.ShardingEnabled && am.distributor.IsPathSupported(req.URL.Path) {
  1006  		am.distributor.DistributeRequest(w, req)
  1007  		return
  1008  	}
  1009  
  1010  	// If sharding is not enabled, or the Distributor does not support this path,
  1011  	// the request is served by this instance.
  1012  	am.serveRequest(w, req)
  1013  }
  1014  
  1015  // HandleRequest implements the gRPC Alertmanager service; it receives requests forwarded by the Alertmanager Distributor.
  1016  func (am *MultitenantAlertmanager) HandleRequest(ctx context.Context, in *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) {
  1017  	return am.grpcServer.Handle(ctx, in)
  1018  }
  1019  
  1020  // serveRequest serves the Alertmanager's web UI and API.
  1021  func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http.Request) {
  1022  	userID, err := tenant.TenantID(req.Context())
  1023  	if err != nil {
  1024  		http.Error(w, err.Error(), http.StatusUnauthorized)
  1025  		return
  1026  	}
  1027  	am.alertmanagersMtx.Lock()
  1028  	userAM, ok := am.alertmanagers[userID]
  1029  	am.alertmanagersMtx.Unlock()
  1030  
  1031  	if ok {
  1032  		userAM.mux.ServeHTTP(w, req)
  1033  		return
  1034  	}
  1035  
  1036  	if am.fallbackConfig != "" {
  1037  		userAM, err = am.alertmanagerFromFallbackConfig(userID)
  1038  		if err != nil {
  1039  			level.Error(am.logger).Log("msg", "unable to initialize the Alertmanager with a fallback configuration", "user", userID, "err", err)
  1040  			http.Error(w, "Failed to initialize the Alertmanager", http.StatusInternalServerError)
  1041  			return
  1042  		}
  1043  
  1044  		userAM.mux.ServeHTTP(w, req)
  1045  		return
  1046  	}
  1047  
  1048  	level.Debug(am.logger).Log("msg", "the Alertmanager has no configuration and no fallback specified", "user", userID)
  1049  	http.Error(w, "the Alertmanager is not configured", http.StatusNotFound)
  1050  }
  1051  
  1052  func (am *MultitenantAlertmanager) alertmanagerFromFallbackConfig(userID string) (*Alertmanager, error) {
  1053  	// Upload an empty config so that the Alertmanager is not de-activated in the next poll.
  1054  	cfgDesc := alertspb.ToProto("", nil, userID)
  1055  	err := am.store.SetAlertConfig(context.Background(), cfgDesc)
  1056  	if err != nil {
  1057  		return nil, err
  1058  	}
  1059  
  1060  	// Calling setConfig with an empty configuration will use the fallback config.
  1061  	err = am.setConfig(cfgDesc)
  1062  	if err != nil {
  1063  		return nil, err
  1064  	}
  1065  
  1066  	am.alertmanagersMtx.Lock()
  1067  	defer am.alertmanagersMtx.Unlock()
  1068  	return am.alertmanagers[userID], nil
  1069  }
  1070  
  1071  // ReplicateStateForUser attempts to replicate a partial state sent by an alertmanager to its other replicas through the ring.
  1072  func (am *MultitenantAlertmanager) ReplicateStateForUser(ctx context.Context, userID string, part *clusterpb.Part) error {
  1073  	level.Debug(am.logger).Log("msg", "message received for replication", "user", userID, "key", part.Key)
  1074  
  1075  	selfAddress := am.ringLifecycler.GetInstanceAddr()
  1076  	err := ring.DoBatch(ctx, RingOp, am.ring, []uint32{shardByUser(userID)}, func(desc ring.InstanceDesc, _ []int) error {
  1077  		if desc.GetAddr() == selfAddress {
  1078  			return nil
  1079  		}
  1080  
  1081  		c, err := am.alertmanagerClientsPool.GetClientFor(desc.GetAddr())
  1082  		if err != nil {
  1083  			return err
  1084  		}
  1085  
  1086  		resp, err := c.UpdateState(user.InjectOrgID(ctx, userID), part)
  1087  		if err != nil {
  1088  			return err
  1089  		}
  1090  
  1091  		switch resp.Status {
  1092  		case alertmanagerpb.MERGE_ERROR:
  1093  			level.Error(am.logger).Log("msg", "state replication failed", "user", userID, "key", part.Key, "err", resp.Error)
  1094  		case alertmanagerpb.USER_NOT_FOUND:
  1095  			level.Debug(am.logger).Log("msg", "user not found while trying to replicate state", "user", userID, "key", part.Key)
  1096  		}
  1097  		return nil
  1098  	}, func() {})
  1099  
  1100  	return err
  1101  }
  1102  
  1103  // ReadFullStateForUser attempts to read the full state from each replica for the given user. Note that it will try to obtain and return
  1104  // state from all replicas, but will consider it a success if state is obtained from at least one replica.
  1105  func (am *MultitenantAlertmanager) ReadFullStateForUser(ctx context.Context, userID string) ([]*clusterpb.FullState, error) {
  1106  	// Only get the set of replicas which contain the specified user.
  1107  	key := shardByUser(userID)
  1108  	replicationSet, err := am.ring.Get(key, RingOp, nil, nil, nil)
  1109  	if err != nil {
  1110  		return nil, err
  1111  	}
  1112  
  1113  	// We should only query state from other replicas, and not our own state.
  1114  	addrs := replicationSet.GetAddressesWithout(am.ringLifecycler.GetInstanceAddr())
  1115  
  1116  	var (
  1117  		resultsMtx sync.Mutex
  1118  		results    []*clusterpb.FullState
  1119  	)
  1120  
  1121  	// Note that the jobs swallow the errors - this is because we want to give each replica a chance to respond.
  1122  	jobs := concurrency.CreateJobsFromStrings(addrs)
  1123  	err = concurrency.ForEach(ctx, jobs, len(jobs), func(ctx context.Context, job interface{}) error {
  1124  		addr := job.(string)
  1125  		level.Debug(am.logger).Log("msg", "contacting replica for full state", "user", userID, "addr", addr)
  1126  
  1127  		c, err := am.alertmanagerClientsPool.GetClientFor(addr)
  1128  		if err != nil {
  1129  			level.Error(am.logger).Log("msg", "failed to get rpc client", "err", err)
  1130  			return nil
  1131  		}
  1132  
  1133  		resp, err := c.ReadState(user.InjectOrgID(ctx, userID), &alertmanagerpb.ReadStateRequest{})
  1134  		if err != nil {
  1135  			level.Error(am.logger).Log("msg", "rpc reading state from replica failed", "addr", addr, "user", userID, "err", err)
  1136  			return nil
  1137  		}
  1138  
  1139  		switch resp.Status {
  1140  		case alertmanagerpb.READ_OK:
  1141  			resultsMtx.Lock()
  1142  			results = append(results, resp.State)
  1143  			resultsMtx.Unlock()
  1144  		case alertmanagerpb.READ_ERROR:
  1145  			level.Error(am.logger).Log("msg", "error trying to read state", "addr", addr, "user", userID, "err", resp.Error)
  1146  		case alertmanagerpb.READ_USER_NOT_FOUND:
  1147  			level.Debug(am.logger).Log("msg", "user not found while trying to read state", "addr", addr, "user", userID)
  1148  		default:
  1149  			level.Error(am.logger).Log("msg", "unknown response trying to read state", "addr", addr, "user", userID)
  1150  		}
  1151  		return nil
  1152  	})
  1153  	if err != nil {
  1154  		return nil, err
  1155  	}
  1156  
  1157  	// We only require the state from a single replica, though we return as many as we were able to obtain.
  1158  	if len(results) == 0 {
  1159  		return nil, fmt.Errorf("failed to read state from any replica")
  1160  	}
  1161  
  1162  	return results, nil
  1163  }
  1164  
  1165  // UpdateState implements the Alertmanager service.
  1166  func (am *MultitenantAlertmanager) UpdateState(ctx context.Context, part *clusterpb.Part) (*alertmanagerpb.UpdateStateResponse, error) {
  1167  	userID, err := tenant.TenantID(ctx)
  1168  	if err != nil {
  1169  		return nil, err
  1170  	}
  1171  
  1172  	am.alertmanagersMtx.Lock()
  1173  	userAM, ok := am.alertmanagers[userID]
  1174  	am.alertmanagersMtx.Unlock()
  1175  
  1176  	if !ok {
  1177  		// We can end up trying to replicate state to an alertmanager that is no longer available due to e.g. a ring topology change.
  1178  		level.Debug(am.logger).Log("msg", "user does not have an alertmanager in this instance", "user", userID)
  1179  		return &alertmanagerpb.UpdateStateResponse{
  1180  			Status: alertmanagerpb.USER_NOT_FOUND,
  1181  			Error:  "alertmanager for this user does not exist",
  1182  		}, nil
  1183  	}
  1184  
  1185  	if err = userAM.mergePartialExternalState(part); err != nil {
  1186  		return &alertmanagerpb.UpdateStateResponse{
  1187  			Status: alertmanagerpb.MERGE_ERROR,
  1188  			Error:  err.Error(),
  1189  		}, nil
  1190  	}
  1191  
  1192  	return &alertmanagerpb.UpdateStateResponse{Status: alertmanagerpb.OK}, nil
  1193  }
  1194  
  1195  // deleteUnusedRemoteUserState deletes state objects in remote storage for users that are no longer configured.
  1196  func (am *MultitenantAlertmanager) deleteUnusedRemoteUserState(ctx context.Context, allUsers []string) {
  1197  
  1198  	users := make(map[string]struct{}, len(allUsers))
  1199  	for _, userID := range allUsers {
  1200  		users[userID] = struct{}{}
  1201  	}
  1202  
  1203  	usersWithState, err := am.store.ListUsersWithFullState(ctx)
  1204  	if err != nil {
  1205  		level.Warn(am.logger).Log("msg", "failed to list users with state", "err", err)
  1206  		return
  1207  	}
  1208  
  1209  	for _, userID := range usersWithState {
  1210  		if _, ok := users[userID]; ok {
  1211  			continue
  1212  		}
  1213  
  1214  		err := am.store.DeleteFullState(ctx, userID)
  1215  		if err != nil {
  1216  			level.Warn(am.logger).Log("msg", "failed to delete remote state for user", "user", userID, "err", err)
  1217  		} else {
  1218  			level.Info(am.logger).Log("msg", "deleted remote state for user", "user", userID)
  1219  		}
  1220  	}
  1221  }
  1222  
  1223  // deleteUnusedLocalUserState deletes local files for users that we no longer need.
  1224  // deleteUnusedLocalUserState deletes local state files for users that this instance no longer needs.
  1225  	userDirs := am.getPerUserDirectories()
  1226  
  1227  	// Then delete the directories of users that no longer have an Alertmanager running in this instance.
  1228  	for userID, dir := range userDirs {
  1229  		am.alertmanagersMtx.Lock()
  1230  		userAM := am.alertmanagers[userID]
  1231  		am.alertmanagersMtx.Unlock()
  1232  
  1233  		// Don't delete directory if AM for user still exists.
  1234  		if userAM != nil {
  1235  			continue
  1236  		}
  1237  
  1238  		err := os.RemoveAll(dir)
  1239  		if err != nil {
  1240  			level.Warn(am.logger).Log("msg", "failed to delete directory for user", "dir", dir, "user", userID, "err", err)
  1241  		} else {
  1242  			level.Info(am.logger).Log("msg", "deleted local directory for user", "dir", dir, "user", userID)
  1243  		}
  1244  	}
  1245  }
  1246  
  1247  // getPerUserDirectories returns a map of users to their directories (full path). Only users with a local
  1248  // directory are returned.
  1249  func (am *MultitenantAlertmanager) getPerUserDirectories() map[string]string {
  1250  	files, err := ioutil.ReadDir(am.cfg.DataDir)
  1251  	if err != nil {
  1252  		level.Warn(am.logger).Log("msg", "failed to list local dir", "dir", am.cfg.DataDir, "err", err)
  1253  		return nil
  1254  	}
  1255  
  1256  	result := map[string]string{}
  1257  
  1258  	for _, f := range files {
  1259  		fullPath := filepath.Join(am.cfg.DataDir, f.Name())
  1260  
  1261  		if !f.IsDir() {
  1262  			level.Warn(am.logger).Log("msg", "ignoring unexpected file while scanning local alertmanager configs", "file", fullPath)
  1263  			continue
  1264  		}
  1265  
  1266  		result[f.Name()] = fullPath
  1267  	}
  1268  	return result
  1269  }
  1270  
  1271  // ReadState implements the Alertmanager service.
  1272  func (am *MultitenantAlertmanager) ReadState(ctx context.Context, req *alertmanagerpb.ReadStateRequest) (*alertmanagerpb.ReadStateResponse, error) {
  1273  	userID, err := tenant.TenantID(ctx)
  1274  	if err != nil {
  1275  		return nil, err
  1276  	}
  1277  
  1278  	am.alertmanagersMtx.Lock()
  1279  	userAM, ok := am.alertmanagers[userID]
  1280  	am.alertmanagersMtx.Unlock()
  1281  
  1282  	if !ok {
  1283  		level.Debug(am.logger).Log("msg", "user does not have an alertmanager in this instance", "user", userID)
  1284  		return &alertmanagerpb.ReadStateResponse{
  1285  			Status: alertmanagerpb.READ_USER_NOT_FOUND,
  1286  			Error:  "alertmanager for this user does not exist",
  1287  		}, nil
  1288  	}
  1289  
  1290  	state, err := userAM.getFullState()
  1291  	if err != nil {
  1292  		return &alertmanagerpb.ReadStateResponse{
  1293  			Status: alertmanagerpb.READ_ERROR,
  1294  			Error:  err.Error(),
  1295  		}, nil
  1296  	}
  1297  
  1298  	return &alertmanagerpb.ReadStateResponse{
  1299  		Status: alertmanagerpb.READ_OK,
  1300  		State:  state,
  1301  	}, nil
  1302  }
  1303  
  1304  // validateTemplateFilename validates the template filename and returns an error if it's not valid.
  1305  // The validation done in this function is a first fence to prevent a tenant from submitting
  1306  // a config which may escape the per-tenant data directory on disk.
  1307  func validateTemplateFilename(filename string) error {
  1308  	if filepath.Base(filename) != filename {
  1309  		return fmt.Errorf("invalid template name %q: the template name cannot contain any path", filename)
  1310  	}
  1311  
  1312  	// Further enforce no path in the template name.
  1313  	if filepath.Dir(filepath.Clean(filename)) != "." {
  1314  		return fmt.Errorf("invalid template name %q: the template name cannot contain any path", filename)
  1315  	}
  1316  
  1317  	return nil
  1318  }
  1319  
  1320  // safeTemplateFilepath builds and returns the template filepath within the provided dir.
  1321  // This function also performs a security check to make sure the provided templateName
  1322  // doesn't contain a relative path escaping the provided dir.
  1323  func safeTemplateFilepath(dir, templateName string) (string, error) {
  1324  	// We expect all template files to be stored and referenced within the provided directory.
  1325  	containerDir, err := filepath.Abs(dir)
  1326  	if err != nil {
  1327  		return "", err
  1328  	}
  1329  
  1330  	// Build the actual path of the template.
  1331  	actualPath, err := filepath.Abs(filepath.Join(containerDir, templateName))
  1332  	if err != nil {
  1333  		return "", err
  1334  	}
  1335  
  1336  	// Ensure the actual path of the template is within the expected directory.
  1337  	// This check is a counter-measure to make sure the tenant is not trying to
  1338  	// escape its own directory on disk.
  1339  	if !strings.HasPrefix(actualPath, containerDir) {
  1340  		return "", fmt.Errorf("invalid template name %q: the template filepath is escaping the per-tenant local directory", templateName)
  1341  	}
  1342  
  1343  	return actualPath, nil
  1344  }
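
        // For example, with a hypothetical per-tenant templates dir /data/team-a/templates:
        //
        //	safeTemplateFilepath("/data/team-a/templates", "slack.tmpl")       // -> "/data/team-a/templates/slack.tmpl", nil
        //	safeTemplateFilepath("/data/team-a/templates", "../../etc/passwd") // -> "", error (path escapes the per-tenant directory)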
  1345  
  1346  // storeTemplateFile stores the template file at the given templateFilepath.
  1347  // It returns true if the file content has changed (new or updated file), and false if a file with the same name
  1348  // and content was already stored locally.
  1349  func storeTemplateFile(templateFilepath, content string) (bool, error) {
  1350  	// Make sure the directory exists.
  1351  	dir := filepath.Dir(templateFilepath)
  1352  	err := os.MkdirAll(dir, 0755)
  1353  	if err != nil {
  1354  		return false, fmt.Errorf("unable to create Alertmanager templates directory %q: %s", dir, err)
  1355  	}
  1356  
  1357  	// Check if the template file already exists and if it has changed
  1358  	if tmpl, err := ioutil.ReadFile(templateFilepath); err == nil && string(tmpl) == content {
  1359  		return false, nil
  1360  	} else if err != nil && !os.IsNotExist(err) {
  1361  		return false, err
  1362  	}
  1363  
  1364  	if err := ioutil.WriteFile(templateFilepath, []byte(content), 0644); err != nil {
  1365  		return false, fmt.Errorf("unable to create Alertmanager template file %q: %s", templateFilepath, err)
  1366  	}
  1367  
  1368  	return true, nil
  1369  }