github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/multitenant.go (about) 1 package alertmanager 2 3 import ( 4 "context" 5 "flag" 6 "fmt" 7 "io/ioutil" 8 "net/http" 9 "net/url" 10 "os" 11 "path/filepath" 12 "strings" 13 "sync" 14 "time" 15 16 "github.com/go-kit/log" 17 "github.com/go-kit/log/level" 18 "github.com/grafana/dskit/concurrency" 19 "github.com/grafana/dskit/flagext" 20 "github.com/grafana/dskit/kv" 21 "github.com/grafana/dskit/ring" 22 "github.com/grafana/dskit/ring/client" 23 "github.com/grafana/dskit/services" 24 "github.com/pkg/errors" 25 "github.com/prometheus/alertmanager/cluster" 26 "github.com/prometheus/alertmanager/cluster/clusterpb" 27 amconfig "github.com/prometheus/alertmanager/config" 28 "github.com/prometheus/client_golang/prometheus" 29 "github.com/prometheus/client_golang/prometheus/promauto" 30 tsdb_errors "github.com/prometheus/prometheus/tsdb/errors" 31 "github.com/weaveworks/common/httpgrpc" 32 "github.com/weaveworks/common/httpgrpc/server" 33 "github.com/weaveworks/common/user" 34 "golang.org/x/time/rate" 35 36 "github.com/cortexproject/cortex/pkg/alertmanager/alertmanagerpb" 37 "github.com/cortexproject/cortex/pkg/alertmanager/alertspb" 38 "github.com/cortexproject/cortex/pkg/alertmanager/alertstore" 39 "github.com/cortexproject/cortex/pkg/tenant" 40 "github.com/cortexproject/cortex/pkg/util" 41 util_log "github.com/cortexproject/cortex/pkg/util/log" 42 ) 43 44 const ( 45 // If a config sets the webhook URL to this, it will be rewritten to 46 // a URL derived from Config.AutoWebhookRoot 47 autoWebhookURL = "http://internal.monitor" 48 49 // Reasons for (re)syncing alertmanager configurations from object storage. 50 reasonPeriodic = "periodic" 51 reasonInitial = "initial" 52 reasonRingChange = "ring-change" 53 54 // ringAutoForgetUnhealthyPeriods is how many consecutive timeout periods an unhealthy instance 55 // in the ring will be automatically removed. 56 ringAutoForgetUnhealthyPeriods = 5 57 ) 58 59 var ( 60 errInvalidExternalURL = errors.New("the configured external URL is invalid: should not end with /") 61 errShardingLegacyStorage = errors.New("deprecated -alertmanager.storage.* not supported with -alertmanager.sharding-enabled, use -alertmanager-storage.*") 62 errShardingUnsupportedStorage = errors.New("the configured alertmanager storage backend is not supported when sharding is enabled") 63 errZoneAwarenessEnabledWithoutZoneInfo = errors.New("the configured alertmanager has zone awareness enabled but zone is not set") 64 ) 65 66 // MultitenantAlertmanagerConfig is the configuration for a multitenant Alertmanager. 67 type MultitenantAlertmanagerConfig struct { 68 DataDir string `yaml:"data_dir"` 69 Retention time.Duration `yaml:"retention"` 70 ExternalURL flagext.URLValue `yaml:"external_url"` 71 PollInterval time.Duration `yaml:"poll_interval"` 72 MaxRecvMsgSize int64 `yaml:"max_recv_msg_size"` 73 74 // Enable sharding for the Alertmanager 75 ShardingEnabled bool `yaml:"sharding_enabled"` 76 ShardingRing RingConfig `yaml:"sharding_ring"` 77 78 FallbackConfigFile string `yaml:"fallback_config_file"` 79 AutoWebhookRoot string `yaml:"auto_webhook_root"` 80 81 Store alertstore.LegacyConfig `yaml:"storage" doc:"description=Deprecated. Use -alertmanager-storage.* CLI flags and their respective YAML config options instead."` 82 Cluster ClusterConfig `yaml:"cluster"` 83 84 EnableAPI bool `yaml:"enable_api"` 85 86 // For distributor. 87 AlertmanagerClient ClientConfig `yaml:"alertmanager_client"` 88 89 // For the state persister. 90 Persister PersisterConfig `yaml:",inline"` 91 } 92 93 type ClusterConfig struct { 94 ListenAddr string `yaml:"listen_address"` 95 AdvertiseAddr string `yaml:"advertise_address"` 96 Peers flagext.StringSliceCSV `yaml:"peers"` 97 PeerTimeout time.Duration `yaml:"peer_timeout"` 98 GossipInterval time.Duration `yaml:"gossip_interval"` 99 PushPullInterval time.Duration `yaml:"push_pull_interval"` 100 } 101 102 const ( 103 defaultClusterAddr = "0.0.0.0:9094" 104 defaultPeerTimeout = 15 * time.Second 105 ) 106 107 // RegisterFlags adds the flags required to config this to the given FlagSet. 108 func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) { 109 f.StringVar(&cfg.DataDir, "alertmanager.storage.path", "data/", "Base path for data storage.") 110 f.DurationVar(&cfg.Retention, "alertmanager.storage.retention", 5*24*time.Hour, "How long to keep data for.") 111 f.Int64Var(&cfg.MaxRecvMsgSize, "alertmanager.max-recv-msg-size", 16<<20, "Maximum size (bytes) of an accepted HTTP request body.") 112 113 f.Var(&cfg.ExternalURL, "alertmanager.web.external-url", "The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself. If the URL has a path portion, it will be used to prefix all HTTP endpoints served by Alertmanager. If omitted, relevant URL components will be derived automatically.") 114 115 f.StringVar(&cfg.FallbackConfigFile, "alertmanager.configs.fallback", "", "Filename of fallback config to use if none specified for instance.") 116 f.StringVar(&cfg.AutoWebhookRoot, "alertmanager.configs.auto-webhook-root", "", "Root of URL to generate if config is "+autoWebhookURL) 117 f.DurationVar(&cfg.PollInterval, "alertmanager.configs.poll-interval", 15*time.Second, "How frequently to poll Cortex configs") 118 119 f.BoolVar(&cfg.EnableAPI, "experimental.alertmanager.enable-api", false, "Enable the experimental alertmanager config api.") 120 121 f.BoolVar(&cfg.ShardingEnabled, "alertmanager.sharding-enabled", false, "Shard tenants across multiple alertmanager instances.") 122 123 cfg.AlertmanagerClient.RegisterFlagsWithPrefix("alertmanager.alertmanager-client", f) 124 cfg.Persister.RegisterFlagsWithPrefix("alertmanager", f) 125 cfg.ShardingRing.RegisterFlags(f) 126 cfg.Store.RegisterFlags(f) 127 cfg.Cluster.RegisterFlags(f) 128 } 129 130 func (cfg *ClusterConfig) RegisterFlags(f *flag.FlagSet) { 131 prefix := "alertmanager.cluster." 132 f.StringVar(&cfg.ListenAddr, prefix+"listen-address", defaultClusterAddr, "Listen address and port for the cluster. Not specifying this flag disables high-availability mode.") 133 f.StringVar(&cfg.AdvertiseAddr, prefix+"advertise-address", "", "Explicit address or hostname to advertise in cluster.") 134 f.Var(&cfg.Peers, prefix+"peers", "Comma-separated list of initial peers.") 135 f.DurationVar(&cfg.PeerTimeout, prefix+"peer-timeout", defaultPeerTimeout, "Time to wait between peers to send notifications.") 136 f.DurationVar(&cfg.GossipInterval, prefix+"gossip-interval", cluster.DefaultGossipInterval, "The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across cluster more quickly at the expense of increased bandwidth usage.") 137 f.DurationVar(&cfg.PushPullInterval, prefix+"push-pull-interval", cluster.DefaultPushPullInterval, "The interval between gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.") 138 } 139 140 // Validate config and returns error on failure 141 func (cfg *MultitenantAlertmanagerConfig) Validate(storageCfg alertstore.Config) error { 142 if cfg.ExternalURL.URL != nil && strings.HasSuffix(cfg.ExternalURL.Path, "/") { 143 return errInvalidExternalURL 144 } 145 146 if err := cfg.Store.Validate(); err != nil { 147 return errors.Wrap(err, "invalid storage config") 148 } 149 150 if err := cfg.Persister.Validate(); err != nil { 151 return err 152 } 153 154 if cfg.ShardingEnabled { 155 if !cfg.Store.IsDefaults() { 156 return errShardingLegacyStorage 157 } 158 if !storageCfg.IsFullStateSupported() { 159 return errShardingUnsupportedStorage 160 } 161 if cfg.ShardingRing.ZoneAwarenessEnabled && cfg.ShardingRing.InstanceZone == "" { 162 return errZoneAwarenessEnabledWithoutZoneInfo 163 } 164 } 165 166 return nil 167 } 168 169 type multitenantAlertmanagerMetrics struct { 170 lastReloadSuccessful *prometheus.GaugeVec 171 lastReloadSuccessfulTimestamp *prometheus.GaugeVec 172 } 173 174 func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics { 175 m := &multitenantAlertmanagerMetrics{} 176 177 m.lastReloadSuccessful = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 178 Namespace: "cortex", 179 Name: "alertmanager_config_last_reload_successful", 180 Help: "Boolean set to 1 whenever the last configuration reload attempt was successful.", 181 }, []string{"user"}) 182 183 m.lastReloadSuccessfulTimestamp = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 184 Namespace: "cortex", 185 Name: "alertmanager_config_last_reload_successful_seconds", 186 Help: "Timestamp of the last successful configuration reload.", 187 }, []string{"user"}) 188 189 return m 190 } 191 192 // Limits defines limits used by Alertmanager. 193 type Limits interface { 194 // AlertmanagerReceiversBlockCIDRNetworks returns the list of network CIDRs that should be blocked 195 // in the Alertmanager receivers for the given user. 196 AlertmanagerReceiversBlockCIDRNetworks(user string) []flagext.CIDR 197 198 // AlertmanagerReceiversBlockPrivateAddresses returns true if private addresses should be blocked 199 // in the Alertmanager receivers for the given user. 200 AlertmanagerReceiversBlockPrivateAddresses(user string) bool 201 202 // NotificationRateLimit methods return limit used by rate-limiter for given integration. 203 // If set to 0, no notifications are allowed. 204 // rate.Inf = all notifications are allowed. 205 // 206 // Note that when negative or zero values specified by user are translated to rate.Limit by Overrides, 207 // and may have different meaning there. 208 NotificationRateLimit(tenant string, integration string) rate.Limit 209 210 // NotificationBurstSize returns burst-size for rate limiter for given integration type. If 0, no notifications are allowed except 211 // when limit == rate.Inf. 212 NotificationBurstSize(tenant string, integration string) int 213 214 // AlertmanagerMaxConfigSize returns max size of configuration file that user is allowed to upload. If 0, there is no limit. 215 AlertmanagerMaxConfigSize(tenant string) int 216 217 // AlertmanagerMaxTemplatesCount returns max number of templates that tenant can use in the configuration. 0 = no limit. 218 AlertmanagerMaxTemplatesCount(tenant string) int 219 220 // AlertmanagerMaxTemplateSize returns max size of individual template. 0 = no limit. 221 AlertmanagerMaxTemplateSize(tenant string) int 222 223 // AlertmanagerMaxDispatcherAggregationGroups returns maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. 224 // Each aggregation group consumes single goroutine. 0 = unlimited. 225 AlertmanagerMaxDispatcherAggregationGroups(t string) int 226 227 // AlertmanagerMaxAlertsCount returns max number of alerts that tenant can have active at the same time. 0 = no limit. 228 AlertmanagerMaxAlertsCount(tenant string) int 229 230 // AlertmanagerMaxAlertsSizeBytes returns total max size of alerts that tenant can have active at the same time. 0 = no limit. 231 // Size of the alert is computed from alert labels, annotations and generator URL. 232 AlertmanagerMaxAlertsSizeBytes(tenant string) int 233 } 234 235 // A MultitenantAlertmanager manages Alertmanager instances for multiple 236 // organizations. 237 type MultitenantAlertmanager struct { 238 services.Service 239 240 cfg *MultitenantAlertmanagerConfig 241 242 // Ring used for sharding alertmanager instances. 243 // When sharding is disabled, the flow is: 244 // ServeHTTP() -> serveRequest() 245 // When sharding is enabled: 246 // ServeHTTP() -> distributor.DistributeRequest() -> (sends to other AM or even the current) 247 // -> HandleRequest() (gRPC call) -> grpcServer() -> handlerForGRPCServer.ServeHTTP() -> serveRequest(). 248 ringLifecycler *ring.BasicLifecycler 249 ring *ring.Ring 250 distributor *Distributor 251 grpcServer *server.Server 252 253 // Last ring state. This variable is not protected with a mutex because it's always 254 // accessed by a single goroutine at a time. 255 ringLastState ring.ReplicationSet 256 257 // Subservices manager (ring, lifecycler) 258 subservices *services.Manager 259 subservicesWatcher *services.FailureWatcher 260 261 store alertstore.AlertStore 262 263 // The fallback config is stored as a string and parsed every time it's needed 264 // because we mutate the parsed results and don't want those changes to take 265 // effect here. 266 fallbackConfig string 267 268 alertmanagersMtx sync.Mutex 269 alertmanagers map[string]*Alertmanager 270 // Stores the current set of configurations we're running in each tenant's Alertmanager. 271 // Used for comparing configurations as we synchronize them. 272 cfgs map[string]alertspb.AlertConfigDesc 273 274 logger log.Logger 275 alertmanagerMetrics *alertmanagerMetrics 276 multitenantMetrics *multitenantAlertmanagerMetrics 277 278 peer *cluster.Peer 279 alertmanagerClientsPool ClientsPool 280 281 limits Limits 282 283 registry prometheus.Registerer 284 ringCheckErrors prometheus.Counter 285 tenantsOwned prometheus.Gauge 286 tenantsDiscovered prometheus.Gauge 287 syncTotal *prometheus.CounterVec 288 syncFailures *prometheus.CounterVec 289 } 290 291 // NewMultitenantAlertmanager creates a new MultitenantAlertmanager. 292 func NewMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, store alertstore.AlertStore, limits Limits, logger log.Logger, registerer prometheus.Registerer) (*MultitenantAlertmanager, error) { 293 err := os.MkdirAll(cfg.DataDir, 0777) 294 if err != nil { 295 return nil, fmt.Errorf("unable to create Alertmanager data directory %q: %s", cfg.DataDir, err) 296 } 297 298 if cfg.ExternalURL.URL == nil { 299 return nil, fmt.Errorf("unable to create Alertmanager because the external URL has not been configured") 300 } 301 302 var fallbackConfig []byte 303 if cfg.FallbackConfigFile != "" { 304 fallbackConfig, err = ioutil.ReadFile(cfg.FallbackConfigFile) 305 if err != nil { 306 return nil, fmt.Errorf("unable to read fallback config %q: %s", cfg.FallbackConfigFile, err) 307 } 308 _, err = amconfig.LoadFile(cfg.FallbackConfigFile) 309 if err != nil { 310 return nil, fmt.Errorf("unable to load fallback config %q: %s", cfg.FallbackConfigFile, err) 311 } 312 } 313 314 var peer *cluster.Peer 315 // We need to take this case into account to support our legacy upstream clustering. 316 if cfg.Cluster.ListenAddr != "" && !cfg.ShardingEnabled { 317 peer, err = cluster.Create( 318 log.With(logger, "component", "cluster"), 319 registerer, 320 cfg.Cluster.ListenAddr, 321 cfg.Cluster.AdvertiseAddr, 322 cfg.Cluster.Peers, 323 true, 324 cfg.Cluster.PushPullInterval, 325 cfg.Cluster.GossipInterval, 326 cluster.DefaultTcpTimeout, 327 cluster.DefaultProbeTimeout, 328 cluster.DefaultProbeInterval, 329 nil, 330 ) 331 if err != nil { 332 return nil, errors.Wrap(err, "unable to initialize gossip mesh") 333 } 334 err = peer.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout) 335 if err != nil { 336 level.Warn(logger).Log("msg", "unable to join gossip mesh while initializing cluster for high availability mode", "err", err) 337 } 338 go peer.Settle(context.Background(), cluster.DefaultGossipInterval) 339 } 340 341 var ringStore kv.Client 342 if cfg.ShardingEnabled { 343 util_log.WarnExperimentalUse("Alertmanager sharding") 344 345 ringStore, err = kv.NewClient( 346 cfg.ShardingRing.KVStore, 347 ring.GetCodec(), 348 kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("cortex_", registerer), "alertmanager"), 349 logger, 350 ) 351 if err != nil { 352 return nil, errors.Wrap(err, "create KV store client") 353 } 354 } 355 356 return createMultitenantAlertmanager(cfg, fallbackConfig, peer, store, ringStore, limits, logger, registerer) 357 } 358 359 func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackConfig []byte, peer *cluster.Peer, store alertstore.AlertStore, ringStore kv.Client, limits Limits, logger log.Logger, registerer prometheus.Registerer) (*MultitenantAlertmanager, error) { 360 am := &MultitenantAlertmanager{ 361 cfg: cfg, 362 fallbackConfig: string(fallbackConfig), 363 cfgs: map[string]alertspb.AlertConfigDesc{}, 364 alertmanagers: map[string]*Alertmanager{}, 365 alertmanagerMetrics: newAlertmanagerMetrics(), 366 multitenantMetrics: newMultitenantAlertmanagerMetrics(registerer), 367 peer: peer, 368 store: store, 369 logger: log.With(logger, "component", "MultiTenantAlertmanager"), 370 registry: registerer, 371 limits: limits, 372 ringCheckErrors: promauto.With(registerer).NewCounter(prometheus.CounterOpts{ 373 Name: "cortex_alertmanager_ring_check_errors_total", 374 Help: "Number of errors that have occurred when checking the ring for ownership.", 375 }), 376 syncTotal: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{ 377 Name: "cortex_alertmanager_sync_configs_total", 378 Help: "Total number of times the alertmanager sync operation triggered.", 379 }, []string{"reason"}), 380 syncFailures: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{ 381 Name: "cortex_alertmanager_sync_configs_failed_total", 382 Help: "Total number of times the alertmanager sync operation failed.", 383 }, []string{"reason"}), 384 tenantsDiscovered: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{ 385 Name: "cortex_alertmanager_tenants_discovered", 386 Help: "Number of tenants with an Alertmanager configuration discovered.", 387 }), 388 tenantsOwned: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{ 389 Name: "cortex_alertmanager_tenants_owned", 390 Help: "Current number of tenants owned by the Alertmanager instance.", 391 }), 392 } 393 394 // Initialize the top-level metrics. 395 for _, r := range []string{reasonInitial, reasonPeriodic, reasonRingChange} { 396 am.syncTotal.WithLabelValues(r) 397 am.syncFailures.WithLabelValues(r) 398 } 399 400 if cfg.ShardingEnabled { 401 lifecyclerCfg, err := am.cfg.ShardingRing.ToLifecyclerConfig(am.logger) 402 if err != nil { 403 return nil, errors.Wrap(err, "failed to initialize Alertmanager's lifecycler config") 404 } 405 406 // Define lifecycler delegates in reverse order (last to be called defined first because they're 407 // chained via "next delegate"). 408 delegate := ring.BasicLifecyclerDelegate(am) 409 delegate = ring.NewLeaveOnStoppingDelegate(delegate, am.logger) 410 delegate = ring.NewAutoForgetDelegate(am.cfg.ShardingRing.HeartbeatTimeout*ringAutoForgetUnhealthyPeriods, delegate, am.logger) 411 412 am.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, RingNameForServer, RingKey, ringStore, delegate, am.logger, prometheus.WrapRegistererWithPrefix("cortex_", am.registry)) 413 if err != nil { 414 return nil, errors.Wrap(err, "failed to initialize Alertmanager's lifecycler") 415 } 416 417 am.ring, err = ring.NewWithStoreClientAndStrategy(am.cfg.ShardingRing.ToRingConfig(), RingNameForServer, RingKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", am.registry), am.logger) 418 if err != nil { 419 return nil, errors.Wrap(err, "failed to initialize Alertmanager's ring") 420 } 421 422 am.grpcServer = server.NewServer(&handlerForGRPCServer{am: am}) 423 424 am.alertmanagerClientsPool = newAlertmanagerClientsPool(client.NewRingServiceDiscovery(am.ring), cfg.AlertmanagerClient, logger, am.registry) 425 am.distributor, err = NewDistributor(cfg.AlertmanagerClient, cfg.MaxRecvMsgSize, am.ring, am.alertmanagerClientsPool, log.With(logger, "component", "AlertmanagerDistributor"), am.registry) 426 if err != nil { 427 return nil, errors.Wrap(err, "create distributor") 428 } 429 } 430 431 if registerer != nil { 432 registerer.MustRegister(am.alertmanagerMetrics) 433 } 434 435 am.Service = services.NewBasicService(am.starting, am.run, am.stopping) 436 437 return am, nil 438 } 439 440 // handlerForGRPCServer acts as a handler for gRPC server to serve 441 // the serveRequest() via the standard ServeHTTP. 442 type handlerForGRPCServer struct { 443 am *MultitenantAlertmanager 444 } 445 446 func (h *handlerForGRPCServer) ServeHTTP(w http.ResponseWriter, req *http.Request) { 447 h.am.serveRequest(w, req) 448 } 449 450 func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { 451 err = am.migrateStateFilesToPerTenantDirectories() 452 if err != nil { 453 return err 454 } 455 456 defer func() { 457 if err == nil || am.subservices == nil { 458 return 459 } 460 461 if stopErr := services.StopManagerAndAwaitStopped(context.Background(), am.subservices); stopErr != nil { 462 level.Error(am.logger).Log("msg", "failed to gracefully stop alertmanager dependencies", "err", stopErr) 463 } 464 }() 465 466 if am.cfg.ShardingEnabled { 467 if am.subservices, err = services.NewManager(am.ringLifecycler, am.ring, am.distributor); err != nil { 468 return errors.Wrap(err, "failed to start alertmanager's subservices") 469 } 470 471 if err = services.StartManagerAndAwaitHealthy(ctx, am.subservices); err != nil { 472 return errors.Wrap(err, "failed to start alertmanager's subservices") 473 } 474 475 am.subservicesWatcher = services.NewFailureWatcher() 476 am.subservicesWatcher.WatchManager(am.subservices) 477 478 // We wait until the instance is in the JOINING state, once it does we know that tokens are assigned to this instance and we'll be ready to perform an initial sync of configs. 479 level.Info(am.logger).Log("waiting until alertmanager is JOINING in the ring") 480 if err = ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil { 481 return err 482 } 483 level.Info(am.logger).Log("msg", "alertmanager is JOINING in the ring") 484 } 485 486 // At this point, if sharding is enabled, the instance is registered with some tokens 487 // and we can run the initial iteration to sync configs. If no sharding is enabled we load _all_ the configs. 488 if err := am.loadAndSyncConfigs(ctx, reasonInitial); err != nil { 489 return err 490 } 491 492 if am.cfg.ShardingEnabled { 493 // Store the ring state after the initial Alertmanager configs sync has been done and before we do change 494 // our state in the ring. 495 am.ringLastState, _ = am.ring.GetAllHealthy(RingOp) 496 497 // Make sure that all the alertmanagers we were initially configured with have 498 // fetched state from the replicas, before advertising as ACTIVE. This will 499 // reduce the possibility that we lose state when new instances join/leave. 500 level.Info(am.logger).Log("msg", "waiting until initial state sync is complete for all users") 501 if err := am.waitInitialStateSync(ctx); err != nil { 502 return errors.Wrap(err, "failed to wait for initial state sync") 503 } 504 level.Info(am.logger).Log("msg", "initial state sync is complete") 505 506 // With the initial sync now completed, we should have loaded all assigned alertmanager configurations to this instance. We can switch it to ACTIVE and start serving requests. 507 if err := am.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil { 508 return errors.Wrapf(err, "switch instance to %s in the ring", ring.ACTIVE) 509 } 510 511 // Wait until the ring client detected this instance in the ACTIVE state. 512 level.Info(am.logger).Log("msg", "waiting until alertmanager is ACTIVE in the ring") 513 if err := ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil { 514 return err 515 } 516 level.Info(am.logger).Log("msg", "alertmanager is ACTIVE in the ring") 517 } 518 519 return nil 520 } 521 522 // migrateStateFilesToPerTenantDirectories migrates any existing configuration from old place to new hierarchy. 523 // TODO: Remove in Cortex 1.11. 524 func (am *MultitenantAlertmanager) migrateStateFilesToPerTenantDirectories() error { 525 migrate := func(from, to string) error { 526 level.Info(am.logger).Log("msg", "migrating alertmanager state", "from", from, "to", to) 527 err := os.Rename(from, to) 528 return errors.Wrapf(err, "failed to migrate alertmanager state from %v to %v", from, to) 529 } 530 531 st, err := am.getObsoleteFilesPerUser() 532 if err != nil { 533 return errors.Wrap(err, "failed to migrate alertmanager state files") 534 } 535 536 for userID, files := range st { 537 tenantDir := am.getTenantDirectory(userID) 538 err := os.MkdirAll(tenantDir, 0777) 539 if err != nil { 540 return errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) 541 } 542 543 errs := tsdb_errors.NewMulti() 544 545 if files.notificationLogSnapshot != "" { 546 errs.Add(migrate(files.notificationLogSnapshot, filepath.Join(tenantDir, notificationLogSnapshot))) 547 } 548 549 if files.silencesSnapshot != "" { 550 errs.Add(migrate(files.silencesSnapshot, filepath.Join(tenantDir, silencesSnapshot))) 551 } 552 553 if files.templatesDir != "" { 554 errs.Add(migrate(files.templatesDir, filepath.Join(tenantDir, templatesDir))) 555 } 556 557 if err := errs.Err(); err != nil { 558 return err 559 } 560 } 561 return nil 562 } 563 564 type obsoleteStateFiles struct { 565 notificationLogSnapshot string 566 silencesSnapshot string 567 templatesDir string 568 } 569 570 // getObsoleteFilesPerUser returns per-user set of files that should be migrated from old structure to new structure. 571 func (am *MultitenantAlertmanager) getObsoleteFilesPerUser() (map[string]obsoleteStateFiles, error) { 572 files, err := ioutil.ReadDir(am.cfg.DataDir) 573 if err != nil { 574 return nil, errors.Wrapf(err, "failed to list dir %v", am.cfg.DataDir) 575 } 576 577 // old names 578 const ( 579 notificationLogPrefix = "nflog:" 580 silencesPrefix = "silences:" 581 templates = "templates" 582 ) 583 584 result := map[string]obsoleteStateFiles{} 585 586 for _, f := range files { 587 fullPath := filepath.Join(am.cfg.DataDir, f.Name()) 588 589 if f.IsDir() { 590 // Process templates dir. 591 if f.Name() != templates { 592 // Ignore other files -- those are likely per tenant directories. 593 continue 594 } 595 596 templateDirs, err := ioutil.ReadDir(fullPath) 597 if err != nil { 598 return nil, errors.Wrapf(err, "failed to list dir %v", fullPath) 599 } 600 601 // Previously templates directory contained per-tenant subdirectory. 602 for _, d := range templateDirs { 603 if d.IsDir() { 604 v := result[d.Name()] 605 v.templatesDir = filepath.Join(fullPath, d.Name()) 606 result[d.Name()] = v 607 } else { 608 level.Warn(am.logger).Log("msg", "ignoring unknown local file while migrating local alertmanager state files", "file", filepath.Join(fullPath, d.Name())) 609 } 610 } 611 continue 612 } 613 614 switch { 615 case strings.HasPrefix(f.Name(), notificationLogPrefix): 616 userID := strings.TrimPrefix(f.Name(), notificationLogPrefix) 617 v := result[userID] 618 v.notificationLogSnapshot = fullPath 619 result[userID] = v 620 621 case strings.HasPrefix(f.Name(), silencesPrefix): 622 userID := strings.TrimPrefix(f.Name(), silencesPrefix) 623 v := result[userID] 624 v.silencesSnapshot = fullPath 625 result[userID] = v 626 627 default: 628 level.Warn(am.logger).Log("msg", "ignoring unknown local data file while migrating local alertmanager state files", "file", fullPath) 629 } 630 } 631 632 return result, nil 633 } 634 635 func (am *MultitenantAlertmanager) run(ctx context.Context) error { 636 tick := time.NewTicker(am.cfg.PollInterval) 637 defer tick.Stop() 638 639 var ringTickerChan <-chan time.Time 640 641 if am.cfg.ShardingEnabled { 642 ringTicker := time.NewTicker(util.DurationWithJitter(am.cfg.ShardingRing.RingCheckPeriod, 0.2)) 643 defer ringTicker.Stop() 644 ringTickerChan = ringTicker.C 645 } 646 647 for { 648 select { 649 case <-ctx.Done(): 650 return nil 651 case err := <-am.subservicesWatcher.Chan(): 652 return errors.Wrap(err, "alertmanager subservices failed") 653 case <-tick.C: 654 // We don't want to halt execution here but instead just log what happened. 655 if err := am.loadAndSyncConfigs(ctx, reasonPeriodic); err != nil { 656 level.Warn(am.logger).Log("msg", "error while synchronizing alertmanager configs", "err", err) 657 } 658 case <-ringTickerChan: 659 // We ignore the error because in case of error it will return an empty 660 // replication set which we use to compare with the previous state. 661 currRingState, _ := am.ring.GetAllHealthy(RingOp) 662 663 if ring.HasReplicationSetChanged(am.ringLastState, currRingState) { 664 am.ringLastState = currRingState 665 if err := am.loadAndSyncConfigs(ctx, reasonRingChange); err != nil { 666 level.Warn(am.logger).Log("msg", "error while synchronizing alertmanager configs", "err", err) 667 } 668 } 669 } 670 } 671 } 672 673 func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncReason string) error { 674 level.Info(am.logger).Log("msg", "synchronizing alertmanager configs for users") 675 am.syncTotal.WithLabelValues(syncReason).Inc() 676 677 allUsers, cfgs, err := am.loadAlertmanagerConfigs(ctx) 678 if err != nil { 679 am.syncFailures.WithLabelValues(syncReason).Inc() 680 return err 681 } 682 683 am.syncConfigs(cfgs) 684 am.deleteUnusedLocalUserState() 685 686 // Currently, remote state persistence is only used when sharding is enabled. 687 if am.cfg.ShardingEnabled { 688 // Note when cleaning up remote state, remember that the user may not necessarily be configured 689 // in this instance. Therefore, pass the list of _all_ configured users to filter by. 690 am.deleteUnusedRemoteUserState(ctx, allUsers) 691 } 692 693 return nil 694 } 695 696 func (am *MultitenantAlertmanager) waitInitialStateSync(ctx context.Context) error { 697 am.alertmanagersMtx.Lock() 698 ams := make([]*Alertmanager, 0, len(am.alertmanagers)) 699 for _, userAM := range am.alertmanagers { 700 ams = append(ams, userAM) 701 } 702 am.alertmanagersMtx.Unlock() 703 704 for _, userAM := range ams { 705 if err := userAM.WaitInitialStateSync(ctx); err != nil { 706 return err 707 } 708 } 709 710 return nil 711 } 712 713 // stopping runs when MultitenantAlertmanager transitions to Stopping state. 714 func (am *MultitenantAlertmanager) stopping(_ error) error { 715 am.alertmanagersMtx.Lock() 716 for _, am := range am.alertmanagers { 717 am.StopAndWait() 718 } 719 am.alertmanagersMtx.Unlock() 720 if am.peer != nil { // Tests don't setup any peer. 721 err := am.peer.Leave(am.cfg.Cluster.PeerTimeout) 722 if err != nil { 723 level.Warn(am.logger).Log("msg", "failed to leave the cluster", "err", err) 724 } 725 } 726 727 if am.subservices != nil { 728 // subservices manages ring and lifecycler, if sharding was enabled. 729 _ = services.StopManagerAndAwaitStopped(context.Background(), am.subservices) 730 } 731 return nil 732 } 733 734 // loadAlertmanagerConfigs Loads (and filters) the alertmanagers configuration from object storage, taking into consideration the sharding strategy. Returns: 735 // - The list of discovered users (all users with a configuration in storage) 736 // - The configurations of users owned by this instance. 737 func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context) ([]string, map[string]alertspb.AlertConfigDesc, error) { 738 // Find all users with an alertmanager config. 739 allUserIDs, err := am.store.ListAllUsers(ctx) 740 if err != nil { 741 return nil, nil, errors.Wrap(err, "failed to list users with alertmanager configuration") 742 } 743 numUsersDiscovered := len(allUserIDs) 744 ownedUserIDs := make([]string, 0, len(allUserIDs)) 745 746 // Filter out users not owned by this shard. 747 for _, userID := range allUserIDs { 748 if am.isUserOwned(userID) { 749 ownedUserIDs = append(ownedUserIDs, userID) 750 } 751 } 752 numUsersOwned := len(ownedUserIDs) 753 754 // Load the configs for the owned users. 755 configs, err := am.store.GetAlertConfigs(ctx, ownedUserIDs) 756 if err != nil { 757 return nil, nil, errors.Wrapf(err, "failed to load alertmanager configurations for owned users") 758 } 759 760 am.tenantsDiscovered.Set(float64(numUsersDiscovered)) 761 am.tenantsOwned.Set(float64(numUsersOwned)) 762 return allUserIDs, configs, nil 763 } 764 765 func (am *MultitenantAlertmanager) isUserOwned(userID string) bool { 766 // If sharding is disabled, any alertmanager instance owns all users. 767 if !am.cfg.ShardingEnabled { 768 return true 769 } 770 771 alertmanagers, err := am.ring.Get(shardByUser(userID), SyncRingOp, nil, nil, nil) 772 if err != nil { 773 am.ringCheckErrors.Inc() 774 level.Error(am.logger).Log("msg", "failed to load alertmanager configuration", "user", userID, "err", err) 775 return false 776 } 777 778 return alertmanagers.Includes(am.ringLifecycler.GetInstanceAddr()) 779 } 780 781 func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alertspb.AlertConfigDesc) { 782 level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgs)) 783 for user, cfg := range cfgs { 784 err := am.setConfig(cfg) 785 if err != nil { 786 am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0)) 787 level.Warn(am.logger).Log("msg", "error applying config", "err", err) 788 continue 789 } 790 791 am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1)) 792 am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() 793 } 794 795 userAlertmanagersToStop := map[string]*Alertmanager{} 796 797 am.alertmanagersMtx.Lock() 798 for userID, userAM := range am.alertmanagers { 799 if _, exists := cfgs[userID]; !exists { 800 userAlertmanagersToStop[userID] = userAM 801 delete(am.alertmanagers, userID) 802 delete(am.cfgs, userID) 803 am.multitenantMetrics.lastReloadSuccessful.DeleteLabelValues(userID) 804 am.multitenantMetrics.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) 805 am.alertmanagerMetrics.removeUserRegistry(userID) 806 } 807 } 808 am.alertmanagersMtx.Unlock() 809 810 // Now stop alertmanagers and wait until they are really stopped, without holding lock. 811 for userID, userAM := range userAlertmanagersToStop { 812 level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", userID) 813 userAM.StopAndWait() 814 level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", userID) 815 } 816 } 817 818 // setConfig applies the given configuration to the alertmanager for `userID`, 819 // creating an alertmanager if it doesn't already exist. 820 func (am *MultitenantAlertmanager) setConfig(cfg alertspb.AlertConfigDesc) error { 821 var userAmConfig *amconfig.Config 822 var err error 823 var hasTemplateChanges bool 824 var userTemplateDir = filepath.Join(am.getTenantDirectory(cfg.User), templatesDir) 825 var pathsToRemove = make(map[string]struct{}) 826 827 // List existing files to keep track the ones to be removed 828 if oldTemplateFiles, err := ioutil.ReadDir(userTemplateDir); err == nil { 829 for _, file := range oldTemplateFiles { 830 pathsToRemove[filepath.Join(userTemplateDir, file.Name())] = struct{}{} 831 } 832 } 833 834 for _, tmpl := range cfg.Templates { 835 templateFilePath, err := safeTemplateFilepath(userTemplateDir, tmpl.Filename) 836 if err != nil { 837 return err 838 } 839 840 // Removing from pathsToRemove map the files that still exists in the config 841 delete(pathsToRemove, templateFilePath) 842 hasChanged, err := storeTemplateFile(templateFilePath, tmpl.Body) 843 if err != nil { 844 return err 845 } 846 847 if hasChanged { 848 hasTemplateChanges = true 849 } 850 } 851 852 for pathToRemove := range pathsToRemove { 853 err := os.Remove(pathToRemove) 854 if err != nil { 855 level.Warn(am.logger).Log("msg", "failed to remove file", "file", pathToRemove, "err", err) 856 } 857 hasTemplateChanges = true 858 } 859 860 level.Debug(am.logger).Log("msg", "setting config", "user", cfg.User) 861 862 am.alertmanagersMtx.Lock() 863 defer am.alertmanagersMtx.Unlock() 864 existing, hasExisting := am.alertmanagers[cfg.User] 865 866 rawCfg := cfg.RawConfig 867 if cfg.RawConfig == "" { 868 if am.fallbackConfig == "" { 869 return fmt.Errorf("blank Alertmanager configuration for %v", cfg.User) 870 } 871 level.Debug(am.logger).Log("msg", "blank Alertmanager configuration; using fallback", "user", cfg.User) 872 userAmConfig, err = amconfig.Load(am.fallbackConfig) 873 if err != nil { 874 return fmt.Errorf("unable to load fallback configuration for %v: %v", cfg.User, err) 875 } 876 rawCfg = am.fallbackConfig 877 } else { 878 userAmConfig, err = amconfig.Load(cfg.RawConfig) 879 if err != nil && hasExisting { 880 // This means that if a user has a working config and 881 // they submit a broken one, the Manager will keep running the last known 882 // working configuration. 883 return fmt.Errorf("invalid Cortex configuration for %v: %v", cfg.User, err) 884 } 885 } 886 887 // We can have an empty configuration here if: 888 // 1) the user had a previous alertmanager 889 // 2) then, submitted a non-working configuration (and we kept running the prev working config) 890 // 3) finally, the cortex AM instance is restarted and the running version is no longer present 891 if userAmConfig == nil { 892 return fmt.Errorf("no usable Alertmanager configuration for %v", cfg.User) 893 } 894 895 // Transform webhook configs URLs to the per tenant monitor 896 if am.cfg.AutoWebhookRoot != "" { 897 for i, r := range userAmConfig.Receivers { 898 for j, w := range r.WebhookConfigs { 899 if w.URL.String() == autoWebhookURL { 900 u, err := url.Parse(am.cfg.AutoWebhookRoot + "/" + cfg.User + "/monitor") 901 if err != nil { 902 return err 903 } 904 905 userAmConfig.Receivers[i].WebhookConfigs[j].URL = &amconfig.URL{URL: u} 906 } 907 } 908 } 909 } 910 911 // If no Alertmanager instance exists for this user yet, start one. 912 if !hasExisting { 913 level.Debug(am.logger).Log("msg", "initializing new per-tenant alertmanager", "user", cfg.User) 914 newAM, err := am.newAlertmanager(cfg.User, userAmConfig, rawCfg) 915 if err != nil { 916 return err 917 } 918 am.alertmanagers[cfg.User] = newAM 919 } else if am.cfgs[cfg.User].RawConfig != cfg.RawConfig || hasTemplateChanges { 920 level.Info(am.logger).Log("msg", "updating new per-tenant alertmanager", "user", cfg.User) 921 // If the config changed, apply the new one. 922 err := existing.ApplyConfig(cfg.User, userAmConfig, rawCfg) 923 if err != nil { 924 return fmt.Errorf("unable to apply Alertmanager config for user %v: %v", cfg.User, err) 925 } 926 } 927 928 am.cfgs[cfg.User] = cfg 929 return nil 930 } 931 932 func (am *MultitenantAlertmanager) getTenantDirectory(userID string) string { 933 return filepath.Join(am.cfg.DataDir, userID) 934 } 935 936 func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config, rawCfg string) (*Alertmanager, error) { 937 reg := prometheus.NewRegistry() 938 939 tenantDir := am.getTenantDirectory(userID) 940 err := os.MkdirAll(tenantDir, 0777) 941 if err != nil { 942 return nil, errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) 943 } 944 945 newAM, err := New(&Config{ 946 UserID: userID, 947 TenantDataDir: tenantDir, 948 Logger: am.logger, 949 Peer: am.peer, 950 PeerTimeout: am.cfg.Cluster.PeerTimeout, 951 Retention: am.cfg.Retention, 952 ExternalURL: am.cfg.ExternalURL.URL, 953 ShardingEnabled: am.cfg.ShardingEnabled, 954 Replicator: am, 955 ReplicationFactor: am.cfg.ShardingRing.ReplicationFactor, 956 Store: am.store, 957 PersisterConfig: am.cfg.Persister, 958 Limits: am.limits, 959 }, reg) 960 if err != nil { 961 return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err) 962 } 963 964 if err := newAM.ApplyConfig(userID, amConfig, rawCfg); err != nil { 965 return nil, fmt.Errorf("unable to apply initial config for user %v: %v", userID, err) 966 } 967 968 am.alertmanagerMetrics.addUserRegistry(userID, reg) 969 return newAM, nil 970 } 971 972 // GetPositionForUser returns the position this Alertmanager instance holds in the ring related to its other replicas for an specific user. 973 func (am *MultitenantAlertmanager) GetPositionForUser(userID string) int { 974 // If we have a replication factor of 1 or less we don't need to do any work and can immediately return. 975 if am.ring == nil || am.ring.ReplicationFactor() <= 1 { 976 return 0 977 } 978 979 set, err := am.ring.Get(shardByUser(userID), RingOp, nil, nil, nil) 980 if err != nil { 981 level.Error(am.logger).Log("msg", "unable to read the ring while trying to determine the alertmanager position", "err", err) 982 // If we're unable to determine the position, we don't want a tenant to miss out on the notification - instead, 983 // just assume we're the first in line and run the risk of a double notification. 984 return 0 985 } 986 987 var position int 988 for i, instance := range set.Instances { 989 if instance.Addr == am.ringLifecycler.GetInstanceAddr() { 990 position = i 991 break 992 } 993 } 994 995 return position 996 } 997 998 // ServeHTTP serves the Alertmanager's web UI and API. 999 func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Request) { 1000 if am.State() != services.Running { 1001 http.Error(w, "Alertmanager not ready", http.StatusServiceUnavailable) 1002 return 1003 } 1004 1005 if am.cfg.ShardingEnabled && am.distributor.IsPathSupported(req.URL.Path) { 1006 am.distributor.DistributeRequest(w, req) 1007 return 1008 } 1009 1010 // If sharding is not enabled or Distributor does not support this path, 1011 // it is served by this instance. 1012 am.serveRequest(w, req) 1013 } 1014 1015 // HandleRequest implements gRPC Alertmanager service, which receives request from AlertManager-Distributor. 1016 func (am *MultitenantAlertmanager) HandleRequest(ctx context.Context, in *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) { 1017 return am.grpcServer.Handle(ctx, in) 1018 } 1019 1020 // serveRequest serves the Alertmanager's web UI and API. 1021 func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http.Request) { 1022 userID, err := tenant.TenantID(req.Context()) 1023 if err != nil { 1024 http.Error(w, err.Error(), http.StatusUnauthorized) 1025 return 1026 } 1027 am.alertmanagersMtx.Lock() 1028 userAM, ok := am.alertmanagers[userID] 1029 am.alertmanagersMtx.Unlock() 1030 1031 if ok { 1032 userAM.mux.ServeHTTP(w, req) 1033 return 1034 } 1035 1036 if am.fallbackConfig != "" { 1037 userAM, err = am.alertmanagerFromFallbackConfig(userID) 1038 if err != nil { 1039 level.Error(am.logger).Log("msg", "unable to initialize the Alertmanager with a fallback configuration", "user", userID, "err", err) 1040 http.Error(w, "Failed to initialize the Alertmanager", http.StatusInternalServerError) 1041 return 1042 } 1043 1044 userAM.mux.ServeHTTP(w, req) 1045 return 1046 } 1047 1048 level.Debug(am.logger).Log("msg", "the Alertmanager has no configuration and no fallback specified", "user", userID) 1049 http.Error(w, "the Alertmanager is not configured", http.StatusNotFound) 1050 } 1051 1052 func (am *MultitenantAlertmanager) alertmanagerFromFallbackConfig(userID string) (*Alertmanager, error) { 1053 // Upload an empty config so that the Alertmanager is no de-activated in the next poll 1054 cfgDesc := alertspb.ToProto("", nil, userID) 1055 err := am.store.SetAlertConfig(context.Background(), cfgDesc) 1056 if err != nil { 1057 return nil, err 1058 } 1059 1060 // Calling setConfig with an empty configuration will use the fallback config. 1061 err = am.setConfig(cfgDesc) 1062 if err != nil { 1063 return nil, err 1064 } 1065 1066 am.alertmanagersMtx.Lock() 1067 defer am.alertmanagersMtx.Unlock() 1068 return am.alertmanagers[userID], nil 1069 } 1070 1071 // ReplicateStateForUser attempts to replicate a partial state sent by an alertmanager to its other replicas through the ring. 1072 func (am *MultitenantAlertmanager) ReplicateStateForUser(ctx context.Context, userID string, part *clusterpb.Part) error { 1073 level.Debug(am.logger).Log("msg", "message received for replication", "user", userID, "key", part.Key) 1074 1075 selfAddress := am.ringLifecycler.GetInstanceAddr() 1076 err := ring.DoBatch(ctx, RingOp, am.ring, []uint32{shardByUser(userID)}, func(desc ring.InstanceDesc, _ []int) error { 1077 if desc.GetAddr() == selfAddress { 1078 return nil 1079 } 1080 1081 c, err := am.alertmanagerClientsPool.GetClientFor(desc.GetAddr()) 1082 if err != nil { 1083 return err 1084 } 1085 1086 resp, err := c.UpdateState(user.InjectOrgID(ctx, userID), part) 1087 if err != nil { 1088 return err 1089 } 1090 1091 switch resp.Status { 1092 case alertmanagerpb.MERGE_ERROR: 1093 level.Error(am.logger).Log("msg", "state replication failed", "user", userID, "key", part.Key, "err", resp.Error) 1094 case alertmanagerpb.USER_NOT_FOUND: 1095 level.Debug(am.logger).Log("msg", "user not found while trying to replicate state", "user", userID, "key", part.Key) 1096 } 1097 return nil 1098 }, func() {}) 1099 1100 return err 1101 } 1102 1103 // ReadFullStateForUser attempts to read the full state from each replica for user. Note that it will try to obtain and return 1104 // state from all replicas, but will consider it a success if state is obtained from at least one replica. 1105 func (am *MultitenantAlertmanager) ReadFullStateForUser(ctx context.Context, userID string) ([]*clusterpb.FullState, error) { 1106 // Only get the set of replicas which contain the specified user. 1107 key := shardByUser(userID) 1108 replicationSet, err := am.ring.Get(key, RingOp, nil, nil, nil) 1109 if err != nil { 1110 return nil, err 1111 } 1112 1113 // We should only query state from other replicas, and not our own state. 1114 addrs := replicationSet.GetAddressesWithout(am.ringLifecycler.GetInstanceAddr()) 1115 1116 var ( 1117 resultsMtx sync.Mutex 1118 results []*clusterpb.FullState 1119 ) 1120 1121 // Note that the jobs swallow the errors - this is because we want to give each replica a chance to respond. 1122 jobs := concurrency.CreateJobsFromStrings(addrs) 1123 err = concurrency.ForEach(ctx, jobs, len(jobs), func(ctx context.Context, job interface{}) error { 1124 addr := job.(string) 1125 level.Debug(am.logger).Log("msg", "contacting replica for full state", "user", userID, "addr", addr) 1126 1127 c, err := am.alertmanagerClientsPool.GetClientFor(addr) 1128 if err != nil { 1129 level.Error(am.logger).Log("msg", "failed to get rpc client", "err", err) 1130 return nil 1131 } 1132 1133 resp, err := c.ReadState(user.InjectOrgID(ctx, userID), &alertmanagerpb.ReadStateRequest{}) 1134 if err != nil { 1135 level.Error(am.logger).Log("msg", "rpc reading state from replica failed", "addr", addr, "user", userID, "err", err) 1136 return nil 1137 } 1138 1139 switch resp.Status { 1140 case alertmanagerpb.READ_OK: 1141 resultsMtx.Lock() 1142 results = append(results, resp.State) 1143 resultsMtx.Unlock() 1144 case alertmanagerpb.READ_ERROR: 1145 level.Error(am.logger).Log("msg", "error trying to read state", "addr", addr, "user", userID, "err", resp.Error) 1146 case alertmanagerpb.READ_USER_NOT_FOUND: 1147 level.Debug(am.logger).Log("msg", "user not found while trying to read state", "addr", addr, "user", userID) 1148 default: 1149 level.Error(am.logger).Log("msg", "unknown response trying to read state", "addr", addr, "user", userID) 1150 } 1151 return nil 1152 }) 1153 if err != nil { 1154 return nil, err 1155 } 1156 1157 // We only require the state from a single replica, though we return as many as we were able to obtain. 1158 if len(results) == 0 { 1159 return nil, fmt.Errorf("failed to read state from any replica") 1160 } 1161 1162 return results, nil 1163 } 1164 1165 // UpdateState implements the Alertmanager service. 1166 func (am *MultitenantAlertmanager) UpdateState(ctx context.Context, part *clusterpb.Part) (*alertmanagerpb.UpdateStateResponse, error) { 1167 userID, err := tenant.TenantID(ctx) 1168 if err != nil { 1169 return nil, err 1170 } 1171 1172 am.alertmanagersMtx.Lock() 1173 userAM, ok := am.alertmanagers[userID] 1174 am.alertmanagersMtx.Unlock() 1175 1176 if !ok { 1177 // We can end up trying to replicate state to an alertmanager that is no longer available due to e.g. a ring topology change. 1178 level.Debug(am.logger).Log("msg", "user does not have an alertmanager in this instance", "user", userID) 1179 return &alertmanagerpb.UpdateStateResponse{ 1180 Status: alertmanagerpb.USER_NOT_FOUND, 1181 Error: "alertmanager for this user does not exists", 1182 }, nil 1183 } 1184 1185 if err = userAM.mergePartialExternalState(part); err != nil { 1186 return &alertmanagerpb.UpdateStateResponse{ 1187 Status: alertmanagerpb.MERGE_ERROR, 1188 Error: err.Error(), 1189 }, nil 1190 } 1191 1192 return &alertmanagerpb.UpdateStateResponse{Status: alertmanagerpb.OK}, nil 1193 } 1194 1195 // deleteUnusedRemoteUserState deletes state objects in remote storage for users that are no longer configured. 1196 func (am *MultitenantAlertmanager) deleteUnusedRemoteUserState(ctx context.Context, allUsers []string) { 1197 1198 users := make(map[string]struct{}, len(allUsers)) 1199 for _, userID := range allUsers { 1200 users[userID] = struct{}{} 1201 } 1202 1203 usersWithState, err := am.store.ListUsersWithFullState(ctx) 1204 if err != nil { 1205 level.Warn(am.logger).Log("msg", "failed to list users with state", "err", err) 1206 return 1207 } 1208 1209 for _, userID := range usersWithState { 1210 if _, ok := users[userID]; ok { 1211 continue 1212 } 1213 1214 err := am.store.DeleteFullState(ctx, userID) 1215 if err != nil { 1216 level.Warn(am.logger).Log("msg", "failed to delete remote state for user", "user", userID, "err", err) 1217 } else { 1218 level.Info(am.logger).Log("msg", "deleted remote state for user", "user", userID) 1219 } 1220 } 1221 } 1222 1223 // deleteUnusedLocalUserState deletes local files for users that we no longer need. 1224 func (am *MultitenantAlertmanager) deleteUnusedLocalUserState() { 1225 userDirs := am.getPerUserDirectories() 1226 1227 // And delete remaining files. 1228 for userID, dir := range userDirs { 1229 am.alertmanagersMtx.Lock() 1230 userAM := am.alertmanagers[userID] 1231 am.alertmanagersMtx.Unlock() 1232 1233 // Don't delete directory if AM for user still exists. 1234 if userAM != nil { 1235 continue 1236 } 1237 1238 err := os.RemoveAll(dir) 1239 if err != nil { 1240 level.Warn(am.logger).Log("msg", "failed to delete directory for user", "dir", dir, "user", userID, "err", err) 1241 } else { 1242 level.Info(am.logger).Log("msg", "deleted local directory for user", "dir", dir, "user", userID) 1243 } 1244 } 1245 } 1246 1247 // getPerUserDirectories returns map of users to their directories (full path). Only users with local 1248 // directory are returned. 1249 func (am *MultitenantAlertmanager) getPerUserDirectories() map[string]string { 1250 files, err := ioutil.ReadDir(am.cfg.DataDir) 1251 if err != nil { 1252 level.Warn(am.logger).Log("msg", "failed to list local dir", "dir", am.cfg.DataDir, "err", err) 1253 return nil 1254 } 1255 1256 result := map[string]string{} 1257 1258 for _, f := range files { 1259 fullPath := filepath.Join(am.cfg.DataDir, f.Name()) 1260 1261 if !f.IsDir() { 1262 level.Warn(am.logger).Log("msg", "ignoring unexpected file while scanning local alertmanager configs", "file", fullPath) 1263 continue 1264 } 1265 1266 result[f.Name()] = fullPath 1267 } 1268 return result 1269 } 1270 1271 // UpdateState implements the Alertmanager service. 1272 func (am *MultitenantAlertmanager) ReadState(ctx context.Context, req *alertmanagerpb.ReadStateRequest) (*alertmanagerpb.ReadStateResponse, error) { 1273 userID, err := tenant.TenantID(ctx) 1274 if err != nil { 1275 return nil, err 1276 } 1277 1278 am.alertmanagersMtx.Lock() 1279 userAM, ok := am.alertmanagers[userID] 1280 am.alertmanagersMtx.Unlock() 1281 1282 if !ok { 1283 level.Debug(am.logger).Log("msg", "user does not have an alertmanager in this instance", "user", userID) 1284 return &alertmanagerpb.ReadStateResponse{ 1285 Status: alertmanagerpb.READ_USER_NOT_FOUND, 1286 Error: "alertmanager for this user does not exists", 1287 }, nil 1288 } 1289 1290 state, err := userAM.getFullState() 1291 if err != nil { 1292 return &alertmanagerpb.ReadStateResponse{ 1293 Status: alertmanagerpb.READ_ERROR, 1294 Error: err.Error(), 1295 }, nil 1296 } 1297 1298 return &alertmanagerpb.ReadStateResponse{ 1299 Status: alertmanagerpb.READ_OK, 1300 State: state, 1301 }, nil 1302 } 1303 1304 // validateTemplateFilename validated the template filename and returns error if it's not valid. 1305 // The validation done in this function is a first fence to avoid having a tenant submitting 1306 // a config which may escape the per-tenant data directory on disk. 1307 func validateTemplateFilename(filename string) error { 1308 if filepath.Base(filename) != filename { 1309 return fmt.Errorf("invalid template name %q: the template name cannot contain any path", filename) 1310 } 1311 1312 // Further enforce no path in the template name. 1313 if filepath.Dir(filepath.Clean(filename)) != "." { 1314 return fmt.Errorf("invalid template name %q: the template name cannot contain any path", filename) 1315 } 1316 1317 return nil 1318 } 1319 1320 // safeTemplateFilepath builds and return the template filepath within the provided dir. 1321 // This function also performs a security check to make sure the provided templateName 1322 // doesn't contain a relative path escaping the provided dir. 1323 func safeTemplateFilepath(dir, templateName string) (string, error) { 1324 // We expect all template files to be stored and referenced within the provided directory. 1325 containerDir, err := filepath.Abs(dir) 1326 if err != nil { 1327 return "", err 1328 } 1329 1330 // Build the actual path of the template. 1331 actualPath, err := filepath.Abs(filepath.Join(containerDir, templateName)) 1332 if err != nil { 1333 return "", err 1334 } 1335 1336 // Ensure the actual path of the template is within the expected directory. 1337 // This check is a counter-measure to make sure the tenant is not trying to 1338 // escape its own directory on disk. 1339 if !strings.HasPrefix(actualPath, containerDir) { 1340 return "", fmt.Errorf("invalid template name %q: the template filepath is escaping the per-tenant local directory", templateName) 1341 } 1342 1343 return actualPath, nil 1344 } 1345 1346 // storeTemplateFile stores template file at the given templateFilepath. 1347 // Returns true, if file content has changed (new or updated file), false if file with the same name 1348 // and content was already stored locally. 1349 func storeTemplateFile(templateFilepath, content string) (bool, error) { 1350 // Make sure the directory exists. 1351 dir := filepath.Dir(templateFilepath) 1352 err := os.MkdirAll(dir, 0755) 1353 if err != nil { 1354 return false, fmt.Errorf("unable to create Alertmanager templates directory %q: %s", dir, err) 1355 } 1356 1357 // Check if the template file already exists and if it has changed 1358 if tmpl, err := ioutil.ReadFile(templateFilepath); err == nil && string(tmpl) == content { 1359 return false, nil 1360 } else if err != nil && !os.IsNotExist(err) { 1361 return false, err 1362 } 1363 1364 if err := ioutil.WriteFile(templateFilepath, []byte(content), 0644); err != nil { 1365 return false, fmt.Errorf("unable to create Alertmanager template file %q: %s", templateFilepath, err) 1366 } 1367 1368 return true, nil 1369 }