github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/multitenant_test.go (about) 1 package alertmanager 2 3 import ( 4 "bytes" 5 "context" 6 "encoding/json" 7 "errors" 8 "fmt" 9 "io/ioutil" 10 "math/rand" 11 "net/http" 12 "net/http/httptest" 13 "net/http/pprof" 14 "os" 15 "path/filepath" 16 "regexp" 17 "strings" 18 "sync" 19 "testing" 20 "time" 21 22 "github.com/go-kit/log" 23 "github.com/grafana/dskit/concurrency" 24 "github.com/grafana/dskit/flagext" 25 "github.com/grafana/dskit/kv/consul" 26 "github.com/grafana/dskit/ring" 27 "github.com/grafana/dskit/services" 28 "github.com/prometheus/alertmanager/cluster/clusterpb" 29 "github.com/prometheus/alertmanager/notify" 30 "github.com/prometheus/alertmanager/pkg/labels" 31 "github.com/prometheus/alertmanager/types" 32 "github.com/prometheus/client_golang/prometheus" 33 "github.com/prometheus/client_golang/prometheus/testutil" 34 "github.com/prometheus/common/model" 35 "github.com/stretchr/testify/assert" 36 "github.com/stretchr/testify/require" 37 "github.com/thanos-io/thanos/pkg/objstore" 38 "github.com/weaveworks/common/httpgrpc" 39 "github.com/weaveworks/common/user" 40 "go.uber.org/atomic" 41 "golang.org/x/time/rate" 42 "google.golang.org/grpc" 43 44 "github.com/cortexproject/cortex/pkg/alertmanager/alertmanagerpb" 45 "github.com/cortexproject/cortex/pkg/alertmanager/alertspb" 46 "github.com/cortexproject/cortex/pkg/alertmanager/alertstore" 47 "github.com/cortexproject/cortex/pkg/alertmanager/alertstore/bucketclient" 48 "github.com/cortexproject/cortex/pkg/storage/bucket" 49 "github.com/cortexproject/cortex/pkg/util" 50 "github.com/cortexproject/cortex/pkg/util/test" 51 "github.com/cortexproject/cortex/pkg/util/validation" 52 ) 53 54 var ( 55 simpleConfigOne = `route: 56 receiver: dummy 57 58 receivers: 59 - name: dummy` 60 61 simpleConfigTwo = `route: 62 receiver: dummy 63 64 receivers: 65 - name: dummy` 66 ) 67 68 func mockAlertmanagerConfig(t *testing.T) *MultitenantAlertmanagerConfig { 69 t.Helper() 70 71 externalURL := flagext.URLValue{} 72 err := externalURL.Set("http://localhost/api/prom") 73 require.NoError(t, err) 74 75 tempDir, err := ioutil.TempDir(os.TempDir(), "alertmanager") 76 require.NoError(t, err) 77 78 t.Cleanup(func() { 79 err := os.RemoveAll(tempDir) 80 require.NoError(t, err) 81 }) 82 83 cfg := &MultitenantAlertmanagerConfig{} 84 flagext.DefaultValues(cfg) 85 86 cfg.ExternalURL = externalURL 87 cfg.DataDir = tempDir 88 cfg.ShardingRing.InstanceID = "test" 89 cfg.ShardingRing.InstanceAddr = "127.0.0.1" 90 cfg.PollInterval = time.Minute 91 92 return cfg 93 } 94 95 func TestMultitenantAlertmanagerConfig_Validate(t *testing.T) { 96 tests := map[string]struct { 97 setup func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) 98 expected error 99 }{ 100 "should pass with default config": { 101 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) {}, 102 expected: nil, 103 }, 104 "should fail if persistent interval is 0": { 105 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 106 cfg.Persister.Interval = 0 107 }, 108 expected: errInvalidPersistInterval, 109 }, 110 "should fail if persistent interval is negative": { 111 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 112 cfg.Persister.Interval = -1 113 }, 114 expected: errInvalidPersistInterval, 115 }, 116 "should fail if external URL ends with /": { 117 setup: func(t *testing.T, cfg 
*MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 118 require.NoError(t, cfg.ExternalURL.Set("http://localhost/prefix/")) 119 }, 120 expected: errInvalidExternalURL, 121 }, 122 "should succeed if external URL does not end with /": { 123 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 124 require.NoError(t, cfg.ExternalURL.Set("http://localhost/prefix")) 125 }, 126 expected: nil, 127 }, 128 "should succeed if sharding enabled and new storage configuration given with bucket client": { 129 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 130 cfg.ShardingEnabled = true 131 storageCfg.Backend = "s3" 132 }, 133 expected: nil, 134 }, 135 "should fail if sharding enabled and new storage store configuration given with local type": { 136 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 137 cfg.ShardingEnabled = true 138 storageCfg.Backend = "local" 139 }, 140 expected: errShardingUnsupportedStorage, 141 }, 142 "should fail if sharding enabled and new storage store configuration given with configdb type": { 143 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 144 cfg.ShardingEnabled = true 145 storageCfg.Backend = "configdb" 146 }, 147 expected: errShardingUnsupportedStorage, 148 }, 149 "should fail if sharding enabled and legacy store configuration given": { 150 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 151 cfg.ShardingEnabled = true 152 cfg.Store.Type = "s3" 153 }, 154 expected: errShardingLegacyStorage, 155 }, 156 "should fail if zone aware is enabled but zone is not set": { 157 setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) { 158 cfg.ShardingEnabled = true 159 cfg.ShardingRing.ZoneAwarenessEnabled = true 160 }, 161 expected: errZoneAwarenessEnabledWithoutZoneInfo, 162 }, 163 } 164 165 for testName, testData := range tests { 166 t.Run(testName, func(t *testing.T) { 167 cfg := &MultitenantAlertmanagerConfig{} 168 storageCfg := alertstore.Config{} 169 flagext.DefaultValues(cfg) 170 flagext.DefaultValues(&storageCfg) 171 testData.setup(t, cfg, &storageCfg) 172 assert.Equal(t, testData.expected, cfg.Validate(storageCfg)) 173 }) 174 } 175 } 176 177 func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { 178 ctx := context.Background() 179 180 // Run this test using a real storage client. 
181 store := prepareInMemoryAlertStore() 182 require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 183 User: "user1", 184 RawConfig: simpleConfigOne, 185 Templates: []*alertspb.TemplateDesc{}, 186 })) 187 require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 188 User: "user2", 189 RawConfig: simpleConfigOne, 190 Templates: []*alertspb.TemplateDesc{}, 191 })) 192 193 reg := prometheus.NewPedanticRegistry() 194 cfg := mockAlertmanagerConfig(t) 195 am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, nil, log.NewNopLogger(), reg) 196 require.NoError(t, err) 197 198 // Ensure the configs are synced correctly 199 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 200 require.NoError(t, err) 201 require.Len(t, am.alertmanagers, 2) 202 203 currentConfig, cfgExists := am.cfgs["user1"] 204 require.True(t, cfgExists) 205 require.Equal(t, simpleConfigOne, currentConfig.RawConfig) 206 207 assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` 208 # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. 209 # TYPE cortex_alertmanager_config_last_reload_successful gauge 210 cortex_alertmanager_config_last_reload_successful{user="user1"} 1 211 cortex_alertmanager_config_last_reload_successful{user="user2"} 1 212 `), "cortex_alertmanager_config_last_reload_successful")) 213 214 // Ensure when a 3rd config is added, it is synced correctly 215 user3Cfg := alertspb.AlertConfigDesc{ 216 User: "user3", 217 RawConfig: simpleConfigOne + ` 218 templates: 219 - 'first.tpl' 220 - 'second.tpl' 221 `, 222 Templates: []*alertspb.TemplateDesc{ 223 { 224 Filename: "first.tpl", 225 Body: `{{ define "t1" }}Template 1 ... {{end}}`, 226 }, 227 { 228 Filename: "second.tpl", 229 Body: `{{ define "t2" }}Template 2{{ end}}`, 230 }, 231 }, 232 } 233 require.NoError(t, store.SetAlertConfig(ctx, user3Cfg)) 234 235 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 236 require.NoError(t, err) 237 require.Len(t, am.alertmanagers, 3) 238 239 dirs := am.getPerUserDirectories() 240 user3Dir := dirs["user3"] 241 require.NotZero(t, user3Dir) 242 require.True(t, dirExists(t, user3Dir)) 243 require.True(t, dirExists(t, filepath.Join(user3Dir, templatesDir))) 244 require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"))) 245 require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"))) 246 247 assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` 248 # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. 
249 # TYPE cortex_alertmanager_config_last_reload_successful gauge 250 cortex_alertmanager_config_last_reload_successful{user="user1"} 1 251 cortex_alertmanager_config_last_reload_successful{user="user2"} 1 252 cortex_alertmanager_config_last_reload_successful{user="user3"} 1 253 `), "cortex_alertmanager_config_last_reload_successful")) 254 255 // Ensure the config is updated 256 require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 257 User: "user1", 258 RawConfig: simpleConfigTwo, 259 Templates: []*alertspb.TemplateDesc{}, 260 })) 261 262 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 263 require.NoError(t, err) 264 265 currentConfig, cfgExists = am.cfgs["user1"] 266 require.True(t, cfgExists) 267 require.Equal(t, simpleConfigTwo, currentConfig.RawConfig) 268 269 // Test Delete User, ensure config is removed and the resources are freed. 270 require.NoError(t, store.DeleteAlertConfig(ctx, "user3")) 271 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 272 require.NoError(t, err) 273 currentConfig, cfgExists = am.cfgs["user3"] 274 require.False(t, cfgExists) 275 require.Equal(t, "", currentConfig.RawConfig) 276 277 _, cfgExists = am.alertmanagers["user3"] 278 require.False(t, cfgExists) 279 dirs = am.getPerUserDirectories() 280 require.NotZero(t, dirs["user1"]) 281 require.NotZero(t, dirs["user2"]) 282 require.Zero(t, dirs["user3"]) // User3 is deleted, so we should have no more files for it. 283 require.False(t, fileExists(t, user3Dir)) 284 285 assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` 286 # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. 287 # TYPE cortex_alertmanager_config_last_reload_successful gauge 288 cortex_alertmanager_config_last_reload_successful{user="user1"} 1 289 cortex_alertmanager_config_last_reload_successful{user="user2"} 1 290 `), "cortex_alertmanager_config_last_reload_successful")) 291 292 // Ensure when a 3rd config is re-added, it is synced correctly 293 require.NoError(t, store.SetAlertConfig(ctx, user3Cfg)) 294 295 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 296 require.NoError(t, err) 297 298 currentConfig, cfgExists = am.cfgs["user3"] 299 require.True(t, cfgExists) 300 require.Equal(t, user3Cfg.RawConfig, currentConfig.RawConfig) 301 302 _, cfgExists = am.alertmanagers["user3"] 303 require.True(t, cfgExists) 304 dirs = am.getPerUserDirectories() 305 require.NotZero(t, dirs["user1"]) 306 require.NotZero(t, dirs["user2"]) 307 require.Equal(t, user3Dir, dirs["user3"]) // Dir should exist, even though state files are not generated yet. 308 309 // Hierarchy that existed before should exist again. 310 require.True(t, dirExists(t, user3Dir)) 311 require.True(t, dirExists(t, filepath.Join(user3Dir, templatesDir))) 312 require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"))) 313 require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"))) 314 315 assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` 316 # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. 
317 # TYPE cortex_alertmanager_config_last_reload_successful gauge 318 cortex_alertmanager_config_last_reload_successful{user="user1"} 1 319 cortex_alertmanager_config_last_reload_successful{user="user2"} 1 320 cortex_alertmanager_config_last_reload_successful{user="user3"} 1 321 `), "cortex_alertmanager_config_last_reload_successful")) 322 323 // Removed template files should be cleaned up 324 user3Cfg.Templates = []*alertspb.TemplateDesc{ 325 { 326 Filename: "first.tpl", 327 Body: `{{ define "t1" }}Template 1 ... {{end}}`, 328 }, 329 } 330 331 require.NoError(t, store.SetAlertConfig(ctx, user3Cfg)) 332 333 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 334 require.NoError(t, err) 335 336 require.True(t, dirExists(t, user3Dir)) 337 require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"))) 338 require.False(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"))) 339 } 340 341 func TestMultitenantAlertmanager_FirewallShouldBlockHTTPBasedReceiversWhenEnabled(t *testing.T) { 342 tests := map[string]struct { 343 getAlertmanagerConfig func(backendURL string) string 344 }{ 345 "webhook": { 346 getAlertmanagerConfig: func(backendURL string) string { 347 return fmt.Sprintf(` 348 route: 349 receiver: webhook 350 group_wait: 0s 351 group_interval: 1s 352 353 receivers: 354 - name: webhook 355 webhook_configs: 356 - url: %s 357 `, backendURL) 358 }, 359 }, 360 "pagerduty": { 361 getAlertmanagerConfig: func(backendURL string) string { 362 return fmt.Sprintf(` 363 route: 364 receiver: pagerduty 365 group_wait: 0s 366 group_interval: 1s 367 368 receivers: 369 - name: pagerduty 370 pagerduty_configs: 371 - url: %s 372 routing_key: secret 373 `, backendURL) 374 }, 375 }, 376 "slack": { 377 getAlertmanagerConfig: func(backendURL string) string { 378 return fmt.Sprintf(` 379 route: 380 receiver: slack 381 group_wait: 0s 382 group_interval: 1s 383 384 receivers: 385 - name: slack 386 slack_configs: 387 - api_url: %s 388 channel: test 389 `, backendURL) 390 }, 391 }, 392 "opsgenie": { 393 getAlertmanagerConfig: func(backendURL string) string { 394 return fmt.Sprintf(` 395 route: 396 receiver: opsgenie 397 group_wait: 0s 398 group_interval: 1s 399 400 receivers: 401 - name: opsgenie 402 opsgenie_configs: 403 - api_url: %s 404 api_key: secret 405 `, backendURL) 406 }, 407 }, 408 "wechat": { 409 getAlertmanagerConfig: func(backendURL string) string { 410 return fmt.Sprintf(` 411 route: 412 receiver: wechat 413 group_wait: 0s 414 group_interval: 1s 415 416 receivers: 417 - name: wechat 418 wechat_configs: 419 - api_url: %s 420 api_secret: secret 421 corp_id: babycorp 422 `, backendURL) 423 }, 424 }, 425 } 426 427 for receiverName, testData := range tests { 428 for _, firewallEnabled := range []bool{true, false} { 429 receiverName := receiverName 430 testData := testData 431 firewallEnabled := firewallEnabled 432 433 t.Run(fmt.Sprintf("receiver=%s firewall enabled=%v", receiverName, firewallEnabled), func(t *testing.T) { 434 t.Parallel() 435 436 ctx := context.Background() 437 userID := "user-1" 438 serverInvoked := atomic.NewBool(false) 439 440 // Create a local HTTP server to test whether the request is received. 441 server := httptest.NewServer(http.HandlerFunc(func(writer http.ResponseWriter, request *http.Request) { 442 serverInvoked.Store(true) 443 writer.WriteHeader(http.StatusOK) 444 })) 445 defer server.Close() 446 447 // Create the alertmanager config. 
448 alertmanagerCfg := testData.getAlertmanagerConfig(fmt.Sprintf("http://%s", server.Listener.Addr().String())) 449 450 // Store the alertmanager config in the bucket. 451 store := prepareInMemoryAlertStore() 452 require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 453 User: userID, 454 RawConfig: alertmanagerCfg, 455 })) 456 457 // Prepare the alertmanager config. 458 cfg := mockAlertmanagerConfig(t) 459 460 // Prepare the limits config. 461 var limits validation.Limits 462 flagext.DefaultValues(&limits) 463 limits.AlertmanagerReceiversBlockPrivateAddresses = firewallEnabled 464 465 overrides, err := validation.NewOverrides(limits, nil) 466 require.NoError(t, err) 467 468 // Start the alertmanager. 469 reg := prometheus.NewPedanticRegistry() 470 logs := &concurrency.SyncBuffer{} 471 logger := log.NewLogfmtLogger(logs) 472 am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, overrides, logger, reg) 473 require.NoError(t, err) 474 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 475 t.Cleanup(func() { 476 require.NoError(t, services.StopAndAwaitTerminated(ctx, am)) 477 }) 478 479 // Ensure the configs are synced correctly. 480 assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` 481 # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. 482 # TYPE cortex_alertmanager_config_last_reload_successful gauge 483 cortex_alertmanager_config_last_reload_successful{user="user-1"} 1 484 `), "cortex_alertmanager_config_last_reload_successful")) 485 486 // Create an alert to push. 487 alerts := types.Alerts(&types.Alert{ 488 Alert: model.Alert{ 489 Labels: map[model.LabelName]model.LabelValue{model.AlertNameLabel: "test"}, 490 StartsAt: time.Now().Add(-time.Minute), 491 EndsAt: time.Now().Add(time.Minute), 492 }, 493 UpdatedAt: time.Now(), 494 Timeout: false, 495 }) 496 497 alertsPayload, err := json.Marshal(alerts) 498 require.NoError(t, err) 499 500 // Push an alert. 501 req := httptest.NewRequest(http.MethodPost, cfg.ExternalURL.String()+"/api/v1/alerts", bytes.NewReader(alertsPayload)) 502 req.Header.Set("content-type", "application/json") 503 reqCtx := user.InjectOrgID(req.Context(), userID) 504 { 505 w := httptest.NewRecorder() 506 am.ServeHTTP(w, req.WithContext(reqCtx)) 507 508 resp := w.Result() 509 _, err := ioutil.ReadAll(resp.Body) 510 require.NoError(t, err) 511 assert.Equal(t, http.StatusOK, w.Code) 512 } 513 514 // Ensure the server endpoint has not been called if firewall is enabled. Since the alert is delivered 515 // asynchronously, we should pool it for a short period. 516 deadline := time.Now().Add(3 * time.Second) 517 for { 518 if time.Now().After(deadline) || serverInvoked.Load() { 519 break 520 } 521 time.Sleep(100 * time.Millisecond) 522 } 523 524 assert.Equal(t, !firewallEnabled, serverInvoked.Load()) 525 526 // Print all alertmanager logs to have more information if this test fails in CI. 
527 t.Logf("Alertmanager logs:\n%s", logs.String()) 528 }) 529 } 530 } 531 } 532 533 func TestMultitenantAlertmanager_migrateStateFilesToPerTenantDirectories(t *testing.T) { 534 ctx := context.Background() 535 536 const ( 537 user1 = "user1" 538 user2 = "user2" 539 ) 540 541 store := prepareInMemoryAlertStore() 542 require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 543 User: user2, 544 RawConfig: simpleConfigOne, 545 Templates: []*alertspb.TemplateDesc{}, 546 })) 547 548 reg := prometheus.NewPedanticRegistry() 549 cfg := mockAlertmanagerConfig(t) 550 am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, nil, log.NewNopLogger(), reg) 551 require.NoError(t, err) 552 553 createFile(t, filepath.Join(cfg.DataDir, "nflog:"+user1)) 554 createFile(t, filepath.Join(cfg.DataDir, "silences:"+user1)) 555 createFile(t, filepath.Join(cfg.DataDir, "nflog:"+user2)) 556 createFile(t, filepath.Join(cfg.DataDir, "templates", user2, "template.tpl")) 557 558 require.NoError(t, am.migrateStateFilesToPerTenantDirectories()) 559 require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user1, notificationLogSnapshot))) 560 require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user1, silencesSnapshot))) 561 require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user2, notificationLogSnapshot))) 562 require.True(t, dirExists(t, filepath.Join(cfg.DataDir, user2, templatesDir))) 563 require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user2, templatesDir, "template.tpl"))) 564 } 565 566 func fileExists(t *testing.T, path string) bool { 567 return checkExists(t, path, false) 568 } 569 570 func dirExists(t *testing.T, path string) bool { 571 return checkExists(t, path, true) 572 } 573 574 func checkExists(t *testing.T, path string, dir bool) bool { 575 fi, err := os.Stat(path) 576 if err != nil { 577 if os.IsNotExist(err) { 578 return false 579 } 580 require.NoError(t, err) 581 } 582 583 require.Equal(t, dir, fi.IsDir()) 584 return true 585 } 586 587 func TestMultitenantAlertmanager_deleteUnusedLocalUserState(t *testing.T) { 588 ctx := context.Background() 589 590 const ( 591 user1 = "user1" 592 user2 = "user2" 593 ) 594 595 store := prepareInMemoryAlertStore() 596 require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 597 User: user2, 598 RawConfig: simpleConfigOne, 599 Templates: []*alertspb.TemplateDesc{}, 600 })) 601 602 reg := prometheus.NewPedanticRegistry() 603 cfg := mockAlertmanagerConfig(t) 604 am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, nil, log.NewNopLogger(), reg) 605 require.NoError(t, err) 606 607 createFile(t, filepath.Join(cfg.DataDir, user1, notificationLogSnapshot)) 608 createFile(t, filepath.Join(cfg.DataDir, user1, silencesSnapshot)) 609 createFile(t, filepath.Join(cfg.DataDir, user2, notificationLogSnapshot)) 610 createFile(t, filepath.Join(cfg.DataDir, user2, templatesDir, "template.tpl")) 611 612 dirs := am.getPerUserDirectories() 613 require.Equal(t, 2, len(dirs)) 614 require.NotZero(t, dirs[user1]) 615 require.NotZero(t, dirs[user2]) 616 617 // Ensure the configs are synced correctly 618 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 619 require.NoError(t, err) 620 621 // loadAndSyncConfigs also cleans up obsolete files. Let's verify that. 
622 dirs = am.getPerUserDirectories() 623 624 require.Zero(t, dirs[user1]) // has no configuration, files were deleted 625 require.NotZero(t, dirs[user2]) // has config, files survived 626 } 627 628 func TestMultitenantAlertmanager_zoneAwareSharding(t *testing.T) { 629 ctx := context.Background() 630 alertStore := prepareInMemoryAlertStore() 631 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 632 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 633 634 const ( 635 user1 = "user1" 636 user2 = "user2" 637 user3 = "user3" 638 ) 639 640 createInstance := func(i int, zone string, registries *util.UserRegistries) *MultitenantAlertmanager { 641 reg := prometheus.NewPedanticRegistry() 642 cfg := mockAlertmanagerConfig(t) 643 instanceID := fmt.Sprintf("instance-%d", i) 644 registries.AddUserRegistry(instanceID, reg) 645 646 cfg.ShardingRing.ReplicationFactor = 2 647 cfg.ShardingRing.InstanceID = instanceID 648 cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.1-%d", i) 649 cfg.ShardingEnabled = true 650 cfg.ShardingRing.ZoneAwarenessEnabled = true 651 cfg.ShardingRing.InstanceZone = zone 652 653 am, err := createMultitenantAlertmanager(cfg, nil, nil, alertStore, ringStore, nil, log.NewLogfmtLogger(os.Stdout), reg) 654 require.NoError(t, err) 655 t.Cleanup(func() { 656 require.NoError(t, services.StopAndAwaitTerminated(ctx, am)) 657 }) 658 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 659 660 return am 661 } 662 663 registriesZoneA := util.NewUserRegistries() 664 registriesZoneB := util.NewUserRegistries() 665 666 am1ZoneA := createInstance(1, "zoneA", registriesZoneA) 667 am2ZoneA := createInstance(2, "zoneA", registriesZoneA) 668 am1ZoneB := createInstance(3, "zoneB", registriesZoneB) 669 670 { 671 require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 672 User: user1, 673 RawConfig: simpleConfigOne, 674 Templates: []*alertspb.TemplateDesc{}, 675 })) 676 require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 677 User: user2, 678 RawConfig: simpleConfigOne, 679 Templates: []*alertspb.TemplateDesc{}, 680 })) 681 require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 682 User: user3, 683 RawConfig: simpleConfigOne, 684 Templates: []*alertspb.TemplateDesc{}, 685 })) 686 687 err := am1ZoneA.loadAndSyncConfigs(context.Background(), reasonPeriodic) 688 require.NoError(t, err) 689 err = am2ZoneA.loadAndSyncConfigs(context.Background(), reasonPeriodic) 690 require.NoError(t, err) 691 err = am1ZoneB.loadAndSyncConfigs(context.Background(), reasonPeriodic) 692 require.NoError(t, err) 693 } 694 695 metricsZoneA := registriesZoneA.BuildMetricFamiliesPerUser() 696 metricsZoneB := registriesZoneB.BuildMetricFamiliesPerUser() 697 698 assert.Equal(t, float64(3), metricsZoneA.GetSumOfGauges("cortex_alertmanager_tenants_owned")) 699 assert.Equal(t, float64(3), metricsZoneB.GetSumOfGauges("cortex_alertmanager_tenants_owned")) 700 } 701 702 func TestMultitenantAlertmanager_deleteUnusedRemoteUserState(t *testing.T) { 703 ctx := context.Background() 704 705 const ( 706 user1 = "user1" 707 user2 = "user2" 708 ) 709 710 alertStore := prepareInMemoryAlertStore() 711 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 712 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 713 714 createInstance := func(i int) *MultitenantAlertmanager { 715 reg := prometheus.NewPedanticRegistry() 716 cfg := mockAlertmanagerConfig(t) 717 718 cfg.ShardingRing.ReplicationFactor = 
1 719 cfg.ShardingRing.InstanceID = fmt.Sprintf("instance-%d", i) 720 cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.1-%d", i) 721 cfg.ShardingEnabled = true 722 723 // Increase state write interval so that state gets written sooner, making test faster. 724 cfg.Persister.Interval = 500 * time.Millisecond 725 726 am, err := createMultitenantAlertmanager(cfg, nil, nil, alertStore, ringStore, nil, log.NewLogfmtLogger(os.Stdout), reg) 727 require.NoError(t, err) 728 t.Cleanup(func() { 729 require.NoError(t, services.StopAndAwaitTerminated(ctx, am)) 730 }) 731 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 732 733 return am 734 } 735 736 // Create two instances. With replication factor of 1, this means that only one 737 // of the instances will own the user. This tests that an instance does not delete 738 // state for users that are configured, but are owned by other instances. 739 am1 := createInstance(1) 740 am2 := createInstance(2) 741 742 // Configure the users and wait for the state persister to write some state for both. 743 { 744 require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 745 User: user1, 746 RawConfig: simpleConfigOne, 747 Templates: []*alertspb.TemplateDesc{}, 748 })) 749 require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 750 User: user2, 751 RawConfig: simpleConfigOne, 752 Templates: []*alertspb.TemplateDesc{}, 753 })) 754 755 err := am1.loadAndSyncConfigs(context.Background(), reasonPeriodic) 756 require.NoError(t, err) 757 err = am2.loadAndSyncConfigs(context.Background(), reasonPeriodic) 758 require.NoError(t, err) 759 760 require.Eventually(t, func() bool { 761 _, err1 := alertStore.GetFullState(context.Background(), user1) 762 _, err2 := alertStore.GetFullState(context.Background(), user2) 763 return err1 == nil && err2 == nil 764 }, 5*time.Second, 100*time.Millisecond, "timed out waiting for state to be persisted") 765 } 766 767 // Perform another sync to trigger cleanup; this should have no effect. 768 { 769 err := am1.loadAndSyncConfigs(context.Background(), reasonPeriodic) 770 require.NoError(t, err) 771 err = am2.loadAndSyncConfigs(context.Background(), reasonPeriodic) 772 require.NoError(t, err) 773 774 _, err = alertStore.GetFullState(context.Background(), user1) 775 require.NoError(t, err) 776 _, err = alertStore.GetFullState(context.Background(), user2) 777 require.NoError(t, err) 778 } 779 780 // Delete one configuration and trigger cleanup; state for only that user should be deleted. 781 { 782 require.NoError(t, alertStore.DeleteAlertConfig(ctx, user1)) 783 784 err := am1.loadAndSyncConfigs(context.Background(), reasonPeriodic) 785 require.NoError(t, err) 786 err = am2.loadAndSyncConfigs(context.Background(), reasonPeriodic) 787 require.NoError(t, err) 788 789 _, err = alertStore.GetFullState(context.Background(), user1) 790 require.Equal(t, alertspb.ErrNotFound, err) 791 _, err = alertStore.GetFullState(context.Background(), user2) 792 require.NoError(t, err) 793 } 794 } 795 796 func createFile(t *testing.T, path string) string { 797 dir := filepath.Dir(path) 798 require.NoError(t, os.MkdirAll(dir, 0777)) 799 f, err := os.Create(path) 800 require.NoError(t, err) 801 require.NoError(t, f.Close()) 802 return path 803 } 804 805 func TestMultitenantAlertmanager_NoExternalURL(t *testing.T) { 806 amConfig := mockAlertmanagerConfig(t) 807 amConfig.ExternalURL = flagext.URLValue{} // no external URL 808 809 // Create the Multitenant Alertmanager. 
810 reg := prometheus.NewPedanticRegistry() 811 _, err := NewMultitenantAlertmanager(amConfig, nil, nil, log.NewNopLogger(), reg) 812 813 require.EqualError(t, err, "unable to create Alertmanager because the external URL has not been configured") 814 } 815 816 func TestMultitenantAlertmanager_ServeHTTP(t *testing.T) { 817 // Run this test using a real storage client. 818 store := prepareInMemoryAlertStore() 819 820 amConfig := mockAlertmanagerConfig(t) 821 822 externalURL := flagext.URLValue{} 823 err := externalURL.Set("http://localhost:8080/alertmanager") 824 require.NoError(t, err) 825 826 amConfig.ExternalURL = externalURL 827 828 // Create the Multitenant Alertmanager. 829 reg := prometheus.NewPedanticRegistry() 830 am, err := createMultitenantAlertmanager(amConfig, nil, nil, store, nil, nil, log.NewNopLogger(), reg) 831 require.NoError(t, err) 832 833 require.NoError(t, services.StartAndAwaitRunning(context.Background(), am)) 834 defer services.StopAndAwaitTerminated(context.Background(), am) //nolint:errcheck 835 836 // Request when no user configuration is present. 837 req := httptest.NewRequest("GET", externalURL.String(), nil) 838 ctx := user.InjectOrgID(req.Context(), "user1") 839 840 { 841 w := httptest.NewRecorder() 842 am.ServeHTTP(w, req.WithContext(ctx)) 843 844 resp := w.Result() 845 body, _ := ioutil.ReadAll(resp.Body) 846 require.Equal(t, 404, w.Code) 847 require.Equal(t, "the Alertmanager is not configured\n", string(body)) 848 } 849 850 // Create a configuration for the user in storage. 851 require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 852 User: "user1", 853 RawConfig: simpleConfigTwo, 854 Templates: []*alertspb.TemplateDesc{}, 855 })) 856 857 // Make the alertmanager pick it up. 858 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 859 require.NoError(t, err) 860 861 // Request when AM is active. 862 { 863 w := httptest.NewRecorder() 864 am.ServeHTTP(w, req.WithContext(ctx)) 865 866 require.Equal(t, 301, w.Code) // redirect to UI 867 } 868 869 // Verify that GET /metrics returns 404 even when AM is active. 870 { 871 metricURL := externalURL.String() + "/metrics" 872 require.Equal(t, "http://localhost:8080/alertmanager/metrics", metricURL) 873 verify404(ctx, t, am, "GET", metricURL) 874 } 875 876 // Verify that POST /-/reload returns 404 even when AM is active. 877 { 878 metricURL := externalURL.String() + "/-/reload" 879 require.Equal(t, "http://localhost:8080/alertmanager/-/reload", metricURL) 880 verify404(ctx, t, am, "POST", metricURL) 881 } 882 883 // Verify that GET /debug/index returns 404 even when AM is active. 884 { 885 // Register pprof Index (under non-standard path, but this path is exposed by AM using default MUX!) 
886 http.HandleFunc("/alertmanager/debug/index", pprof.Index) 887 888 metricURL := externalURL.String() + "/debug/index" 889 require.Equal(t, "http://localhost:8080/alertmanager/debug/index", metricURL) 890 verify404(ctx, t, am, "GET", metricURL) 891 } 892 893 // Remove the tenant's Alertmanager 894 require.NoError(t, store.DeleteAlertConfig(ctx, "user1")) 895 err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) 896 require.NoError(t, err) 897 898 { 899 // Request when the alertmanager is gone 900 w := httptest.NewRecorder() 901 am.ServeHTTP(w, req.WithContext(ctx)) 902 903 resp := w.Result() 904 body, _ := ioutil.ReadAll(resp.Body) 905 require.Equal(t, 404, w.Code) 906 require.Equal(t, "the Alertmanager is not configured\n", string(body)) 907 } 908 } 909 910 func verify404(ctx context.Context, t *testing.T, am *MultitenantAlertmanager, method string, url string) { 911 metricsReq := httptest.NewRequest(method, url, strings.NewReader("Hello")) // Body for POST Request. 912 w := httptest.NewRecorder() 913 am.ServeHTTP(w, metricsReq.WithContext(ctx)) 914 915 require.Equal(t, 404, w.Code) 916 } 917 918 func TestMultitenantAlertmanager_ServeHTTPWithFallbackConfig(t *testing.T) { 919 ctx := context.Background() 920 amConfig := mockAlertmanagerConfig(t) 921 922 // Run this test using a real storage client. 923 store := prepareInMemoryAlertStore() 924 925 externalURL := flagext.URLValue{} 926 err := externalURL.Set("http://localhost:8080/alertmanager") 927 require.NoError(t, err) 928 929 fallbackCfg := ` 930 global: 931 smtp_smarthost: 'localhost:25' 932 smtp_from: 'youraddress@example.org' 933 route: 934 receiver: example-email 935 receivers: 936 - name: example-email 937 email_configs: 938 - to: 'youraddress@example.org' 939 ` 940 amConfig.ExternalURL = externalURL 941 942 // Create the Multitenant Alertmanager. 943 am, err := createMultitenantAlertmanager(amConfig, nil, nil, store, nil, nil, log.NewNopLogger(), nil) 944 require.NoError(t, err) 945 am.fallbackConfig = fallbackCfg 946 947 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 948 defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck 949 950 // Request when no user configuration is present. 951 req := httptest.NewRequest("GET", externalURL.String()+"/api/v1/status", nil) 952 w := httptest.NewRecorder() 953 954 am.ServeHTTP(w, req.WithContext(user.InjectOrgID(req.Context(), "user1"))) 955 956 resp := w.Result() 957 958 // It succeeds and the Alertmanager is started. 959 require.Equal(t, http.StatusOK, resp.StatusCode) 960 require.Len(t, am.alertmanagers, 1) 961 _, exists := am.alertmanagers["user1"] 962 require.True(t, exists) 963 964 // Even after a poll... 965 err = am.loadAndSyncConfigs(ctx, reasonPeriodic) 966 require.NoError(t, err) 967 968 // It does not remove the Alertmanager. 969 require.Len(t, am.alertmanagers, 1) 970 _, exists = am.alertmanagers["user1"] 971 require.True(t, exists) 972 973 // Remove the Alertmanager configuration. 974 require.NoError(t, store.DeleteAlertConfig(ctx, "user1")) 975 err = am.loadAndSyncConfigs(ctx, reasonPeriodic) 976 require.NoError(t, err) 977 978 // Even after removing it.. We start it again with the fallback configuration. 
979 w = httptest.NewRecorder() 980 am.ServeHTTP(w, req.WithContext(user.InjectOrgID(req.Context(), "user1"))) 981 982 resp = w.Result() 983 require.Equal(t, http.StatusOK, resp.StatusCode) 984 } 985 986 func TestMultitenantAlertmanager_InitialSyncWithSharding(t *testing.T) { 987 tc := []struct { 988 name string 989 existing bool 990 initialState ring.InstanceState 991 initialTokens ring.Tokens 992 }{ 993 { 994 name: "with no instance in the ring", 995 existing: false, 996 }, 997 { 998 name: "with an instance already in the ring with PENDING state and no tokens", 999 existing: true, 1000 initialState: ring.PENDING, 1001 initialTokens: ring.Tokens{}, 1002 }, 1003 { 1004 name: "with an instance already in the ring with JOINING state and some tokens", 1005 existing: true, 1006 initialState: ring.JOINING, 1007 initialTokens: ring.Tokens{1, 2, 3, 4, 5, 6, 7, 8, 9}, 1008 }, 1009 { 1010 name: "with an instance already in the ring with ACTIVE state and all tokens", 1011 existing: true, 1012 initialState: ring.ACTIVE, 1013 initialTokens: ring.GenerateTokens(128, nil), 1014 }, 1015 { 1016 name: "with an instance already in the ring with LEAVING state and all tokens", 1017 existing: true, 1018 initialState: ring.LEAVING, 1019 initialTokens: ring.Tokens{100000}, 1020 }, 1021 } 1022 1023 for _, tt := range tc { 1024 t.Run(tt.name, func(t *testing.T) { 1025 ctx := context.Background() 1026 amConfig := mockAlertmanagerConfig(t) 1027 amConfig.ShardingEnabled = true 1028 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 1029 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 1030 1031 // Use an alert store with a mocked backend. 1032 bkt := &bucket.ClientMock{} 1033 alertStore := bucketclient.NewBucketAlertStore(bkt, nil, log.NewNopLogger()) 1034 1035 // Setup the initial instance state in the ring. 1036 if tt.existing { 1037 require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) { 1038 ringDesc := ring.GetOrCreateRingDesc(in) 1039 ringDesc.AddIngester(amConfig.ShardingRing.InstanceID, amConfig.ShardingRing.InstanceAddr, "", tt.initialTokens, tt.initialState, time.Now()) 1040 return ringDesc, true, nil 1041 })) 1042 } 1043 1044 am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), nil) 1045 require.NoError(t, err) 1046 defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck 1047 1048 // Before being registered in the ring. 1049 require.False(t, am.ringLifecycler.IsRegistered()) 1050 require.Equal(t, ring.PENDING.String(), am.ringLifecycler.GetState().String()) 1051 require.Equal(t, 0, len(am.ringLifecycler.GetTokens())) 1052 require.Equal(t, ring.Tokens{}, am.ringLifecycler.GetTokens()) 1053 1054 // During the initial sync, we expect two things. That the instance is already 1055 // registered with the ring (meaning we have tokens) and that its state is JOINING. 1056 bkt.MockIterWithCallback("alerts/", nil, nil, func() { 1057 require.True(t, am.ringLifecycler.IsRegistered()) 1058 require.Equal(t, ring.JOINING.String(), am.ringLifecycler.GetState().String()) 1059 }) 1060 bkt.MockIter("alertmanager/", nil, nil) 1061 1062 // Once successfully started, the instance should be ACTIVE in the ring. 1063 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 1064 1065 // After being registered in the ring. 
1066 require.True(t, am.ringLifecycler.IsRegistered()) 1067 require.Equal(t, ring.ACTIVE.String(), am.ringLifecycler.GetState().String()) 1068 require.Equal(t, 128, len(am.ringLifecycler.GetTokens())) 1069 require.Subset(t, am.ringLifecycler.GetTokens(), tt.initialTokens) 1070 }) 1071 } 1072 } 1073 1074 func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) { 1075 tc := []struct { 1076 name string 1077 tenantShardSize int 1078 replicationFactor int 1079 instances int 1080 configs int 1081 expectedTenants int 1082 withSharding bool 1083 }{ 1084 { 1085 name: "sharding disabled, 1 instance", 1086 instances: 1, 1087 configs: 10, 1088 expectedTenants: 10, 1089 }, 1090 { 1091 name: "sharding disabled, 2 instances", 1092 instances: 2, 1093 configs: 10, 1094 expectedTenants: 10 * 2, // each instance loads _all_ tenants. 1095 }, 1096 { 1097 name: "sharding enabled, 1 instance, RF = 1", 1098 withSharding: true, 1099 instances: 1, 1100 replicationFactor: 1, 1101 configs: 10, 1102 expectedTenants: 10, // same as no sharding and 1 instance 1103 }, 1104 { 1105 name: "sharding enabled, 2 instances, RF = 1", 1106 withSharding: true, 1107 instances: 2, 1108 replicationFactor: 1, 1109 configs: 10, 1110 expectedTenants: 10, // configs * replication factor 1111 }, 1112 { 1113 name: "sharding enabled, 3 instances, RF = 2", 1114 withSharding: true, 1115 instances: 3, 1116 replicationFactor: 2, 1117 configs: 10, 1118 expectedTenants: 20, // configs * replication factor 1119 }, 1120 { 1121 name: "sharding enabled, 5 instances, RF = 3", 1122 withSharding: true, 1123 instances: 5, 1124 replicationFactor: 3, 1125 configs: 10, 1126 expectedTenants: 30, // configs * replication factor 1127 }, 1128 } 1129 1130 for _, tt := range tc { 1131 t.Run(tt.name, func(t *testing.T) { 1132 ctx := context.Background() 1133 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 1134 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 1135 1136 alertStore := prepareInMemoryAlertStore() 1137 1138 var instances []*MultitenantAlertmanager 1139 var instanceIDs []string 1140 registries := util.NewUserRegistries() 1141 1142 // First, add the number of configs to the store. 1143 for i := 1; i <= tt.configs; i++ { 1144 u := fmt.Sprintf("u-%d", i) 1145 require.NoError(t, alertStore.SetAlertConfig(context.Background(), alertspb.AlertConfigDesc{ 1146 User: u, 1147 RawConfig: simpleConfigOne, 1148 Templates: []*alertspb.TemplateDesc{}, 1149 })) 1150 } 1151 1152 // Then, create the alertmanager instances, start them and add their registries to the slice. 1153 for i := 1; i <= tt.instances; i++ { 1154 instanceIDs = append(instanceIDs, fmt.Sprintf("alertmanager-%d", i)) 1155 instanceID := fmt.Sprintf("alertmanager-%d", i) 1156 1157 amConfig := mockAlertmanagerConfig(t) 1158 amConfig.ShardingRing.ReplicationFactor = tt.replicationFactor 1159 amConfig.ShardingRing.InstanceID = instanceID 1160 amConfig.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) 1161 // Do not check the ring topology changes or poll in an interval in this test (we explicitly sync alertmanagers). 
1162 amConfig.PollInterval = time.Hour 1163 amConfig.ShardingRing.RingCheckPeriod = time.Hour 1164 1165 if tt.withSharding { 1166 amConfig.ShardingEnabled = true 1167 } 1168 1169 reg := prometheus.NewPedanticRegistry() 1170 am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), reg) 1171 require.NoError(t, err) 1172 defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck 1173 1174 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 1175 1176 instances = append(instances, am) 1177 instanceIDs = append(instanceIDs, instanceID) 1178 registries.AddUserRegistry(instanceID, reg) 1179 } 1180 1181 // If we're testing sharding, we need make sure the ring is settled. 1182 if tt.withSharding { 1183 ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 1184 defer cancel() 1185 1186 // The alertmanager is ready to be tested once all instances are ACTIVE and the ring settles. 1187 for _, am := range instances { 1188 for _, id := range instanceIDs { 1189 require.NoError(t, ring.WaitInstanceState(ctx, am.ring, id, ring.ACTIVE)) 1190 } 1191 } 1192 } 1193 1194 // Now that the ring has settled, sync configs with the instances. 1195 var numConfigs, numInstances int 1196 for _, am := range instances { 1197 err := am.loadAndSyncConfigs(ctx, reasonRingChange) 1198 require.NoError(t, err) 1199 numConfigs += len(am.cfgs) 1200 numInstances += len(am.alertmanagers) 1201 } 1202 1203 metrics := registries.BuildMetricFamiliesPerUser() 1204 assert.Equal(t, tt.expectedTenants, numConfigs) 1205 assert.Equal(t, tt.expectedTenants, numInstances) 1206 assert.Equal(t, float64(tt.expectedTenants), metrics.GetSumOfGauges("cortex_alertmanager_tenants_owned")) 1207 assert.Equal(t, float64(tt.configs*tt.instances), metrics.GetSumOfGauges("cortex_alertmanager_tenants_discovered")) 1208 }) 1209 } 1210 } 1211 1212 func TestMultitenantAlertmanager_SyncOnRingTopologyChanges(t *testing.T) { 1213 registeredAt := time.Now() 1214 1215 tc := []struct { 1216 name string 1217 setupRing func(desc *ring.Desc) 1218 updateRing func(desc *ring.Desc) 1219 expected bool 1220 }{ 1221 { 1222 name: "when an instance is added to the ring", 1223 setupRing: func(desc *ring.Desc) { 1224 desc.AddIngester("alertmanager-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) 1225 }, 1226 updateRing: func(desc *ring.Desc) { 1227 desc.AddIngester("alertmanager-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) 1228 }, 1229 expected: true, 1230 }, 1231 { 1232 name: "when an instance is removed from the ring", 1233 setupRing: func(desc *ring.Desc) { 1234 desc.AddIngester("alertmanager-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) 1235 desc.AddIngester("alertmanager-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) 1236 }, 1237 updateRing: func(desc *ring.Desc) { 1238 desc.RemoveIngester("alertmanager-1") 1239 }, 1240 expected: true, 1241 }, 1242 { 1243 name: "should sync when an instance changes state", 1244 setupRing: func(desc *ring.Desc) { 1245 desc.AddIngester("alertmanager-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) 1246 desc.AddIngester("alertmanager-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.JOINING, registeredAt) 1247 }, 1248 updateRing: func(desc *ring.Desc) { 1249 instance := desc.Ingesters["alertmanager-2"] 1250 instance.State = ring.ACTIVE 1251 desc.Ingesters["alertmanager-2"] = instance 1252 }, 1253 expected: true, 1254 }, 1255 { 1256 name: "should sync when an healthy 
instance becomes unhealthy", 1257 setupRing: func(desc *ring.Desc) { 1258 desc.AddIngester("alertmanager-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) 1259 desc.AddIngester("alertmanager-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) 1260 }, 1261 updateRing: func(desc *ring.Desc) { 1262 instance := desc.Ingesters["alertmanager-1"] 1263 instance.Timestamp = time.Now().Add(-time.Hour).Unix() 1264 desc.Ingesters["alertmanager-1"] = instance 1265 }, 1266 expected: true, 1267 }, 1268 { 1269 name: "should sync when an unhealthy instance becomes healthy", 1270 setupRing: func(desc *ring.Desc) { 1271 desc.AddIngester("alertmanager-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) 1272 1273 instance := desc.AddIngester("alertmanager-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) 1274 instance.Timestamp = time.Now().Add(-time.Hour).Unix() 1275 desc.Ingesters["alertmanager-2"] = instance 1276 }, 1277 updateRing: func(desc *ring.Desc) { 1278 instance := desc.Ingesters["alertmanager-2"] 1279 instance.Timestamp = time.Now().Unix() 1280 desc.Ingesters["alertmanager-2"] = instance 1281 }, 1282 expected: true, 1283 }, 1284 { 1285 name: "should NOT sync when an instance updates the heartbeat", 1286 setupRing: func(desc *ring.Desc) { 1287 desc.AddIngester("alertmanager-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) 1288 desc.AddIngester("alertmanager-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) 1289 }, 1290 updateRing: func(desc *ring.Desc) { 1291 instance := desc.Ingesters["alertmanager-1"] 1292 instance.Timestamp = time.Now().Add(time.Second).Unix() 1293 desc.Ingesters["alertmanager-1"] = instance 1294 }, 1295 expected: false, 1296 }, 1297 { 1298 name: "should NOT sync when an instance is auto-forgotten in the ring but was already unhealthy in the previous state", 1299 setupRing: func(desc *ring.Desc) { 1300 desc.AddIngester("alertmanager-1", "127.0.0.1", "", ring.Tokens{1, 2, 3}, ring.ACTIVE, registeredAt) 1301 desc.AddIngester("alertmanager-2", "127.0.0.2", "", ring.Tokens{4, 5, 6}, ring.ACTIVE, registeredAt) 1302 1303 instance := desc.Ingesters["alertmanager-2"] 1304 instance.Timestamp = time.Now().Add(-time.Hour).Unix() 1305 desc.Ingesters["alertmanager-2"] = instance 1306 }, 1307 updateRing: func(desc *ring.Desc) { 1308 desc.RemoveIngester("alertmanager-2") 1309 }, 1310 expected: false, 1311 }, 1312 } 1313 1314 for _, tt := range tc { 1315 t.Run(tt.name, func(t *testing.T) { 1316 ctx := context.Background() 1317 amConfig := mockAlertmanagerConfig(t) 1318 amConfig.ShardingEnabled = true 1319 amConfig.ShardingRing.RingCheckPeriod = 100 * time.Millisecond 1320 amConfig.PollInterval = time.Hour // Don't trigger the periodic check. 
1321 1322 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 1323 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 1324 1325 alertStore := prepareInMemoryAlertStore() 1326 1327 reg := prometheus.NewPedanticRegistry() 1328 am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), reg) 1329 require.NoError(t, err) 1330 1331 require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) { 1332 ringDesc := ring.GetOrCreateRingDesc(in) 1333 tt.setupRing(ringDesc) 1334 return ringDesc, true, nil 1335 })) 1336 1337 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 1338 defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck 1339 1340 // Make sure the initial sync happened. 1341 regs := util.NewUserRegistries() 1342 regs.AddUserRegistry("test", reg) 1343 metrics := regs.BuildMetricFamiliesPerUser() 1344 assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_alertmanager_sync_configs_total")) 1345 1346 // Change the ring topology. 1347 require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) { 1348 ringDesc := ring.GetOrCreateRingDesc(in) 1349 tt.updateRing(ringDesc) 1350 return ringDesc, true, nil 1351 })) 1352 1353 // Assert if we expected an additional sync or not. 1354 expectedSyncs := 1 1355 if tt.expected { 1356 expectedSyncs++ 1357 } 1358 test.Poll(t, 3*time.Second, float64(expectedSyncs), func() interface{} { 1359 metrics := regs.BuildMetricFamiliesPerUser() 1360 return metrics.GetSumOfCounters("cortex_alertmanager_sync_configs_total") 1361 }) 1362 }) 1363 } 1364 } 1365 1366 func TestMultitenantAlertmanager_RingLifecyclerShouldAutoForgetUnhealthyInstances(t *testing.T) { 1367 const unhealthyInstanceID = "alertmanager-bad-1" 1368 const heartbeatTimeout = time.Minute 1369 ctx := context.Background() 1370 amConfig := mockAlertmanagerConfig(t) 1371 amConfig.ShardingEnabled = true 1372 amConfig.ShardingRing.HeartbeatPeriod = 100 * time.Millisecond 1373 amConfig.ShardingRing.HeartbeatTimeout = heartbeatTimeout 1374 1375 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 1376 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 1377 1378 alertStore := prepareInMemoryAlertStore() 1379 1380 am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), nil) 1381 require.NoError(t, err) 1382 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 1383 defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck 1384 1385 require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) { 1386 ringDesc := ring.GetOrCreateRingDesc(in) 1387 instance := ringDesc.AddIngester(unhealthyInstanceID, "127.0.0.1", "", ring.GenerateTokens(RingNumTokens, nil), ring.ACTIVE, time.Now()) 1388 instance.Timestamp = time.Now().Add(-(ringAutoForgetUnhealthyPeriods + 1) * heartbeatTimeout).Unix() 1389 ringDesc.Ingesters[unhealthyInstanceID] = instance 1390 1391 return ringDesc, true, nil 1392 })) 1393 1394 test.Poll(t, time.Second, false, func() interface{} { 1395 d, err := ringStore.Get(ctx, RingKey) 1396 if err != nil { 1397 return err 1398 } 1399 1400 _, ok := ring.GetOrCreateRingDesc(d).Ingesters[unhealthyInstanceID] 1401 return ok 1402 }) 1403 } 1404 1405 func TestMultitenantAlertmanager_InitialSyncFailureWithSharding(t *testing.T) { 1406 ctx := context.Background() 1407 amConfig := mockAlertmanagerConfig(t) 
1408 amConfig.ShardingEnabled = true 1409 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 1410 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 1411 1412 // Mock the store to fail listing configs. 1413 bkt := &bucket.ClientMock{} 1414 bkt.MockIter("alerts/", nil, errors.New("failed to list alerts")) 1415 bkt.MockIter("alertmanager/", nil, nil) 1416 store := bucketclient.NewBucketAlertStore(bkt, nil, log.NewNopLogger()) 1417 1418 am, err := createMultitenantAlertmanager(amConfig, nil, nil, store, ringStore, nil, log.NewNopLogger(), nil) 1419 require.NoError(t, err) 1420 defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck 1421 1422 require.NoError(t, am.StartAsync(ctx)) 1423 err = am.AwaitRunning(ctx) 1424 require.Error(t, err) 1425 require.Equal(t, services.Failed, am.State()) 1426 require.False(t, am.ringLifecycler.IsRegistered()) 1427 require.NotNil(t, am.ring) 1428 } 1429 1430 func TestAlertmanager_ReplicasPosition(t *testing.T) { 1431 ctx := context.Background() 1432 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 1433 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 1434 1435 mockStore := prepareInMemoryAlertStore() 1436 require.NoError(t, mockStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 1437 User: "user-1", 1438 RawConfig: simpleConfigOne, 1439 Templates: []*alertspb.TemplateDesc{}, 1440 })) 1441 1442 var instances []*MultitenantAlertmanager 1443 var instanceIDs []string 1444 registries := util.NewUserRegistries() 1445 1446 // First, create the alertmanager instances, we'll use a replication factor of 3 and create 3 instances so that we can get the tenant on each replica. 1447 for i := 1; i <= 3; i++ { 1448 // instanceIDs = append(instanceIDs, fmt.Sprintf("alertmanager-%d", i)) 1449 instanceID := fmt.Sprintf("alertmanager-%d", i) 1450 1451 amConfig := mockAlertmanagerConfig(t) 1452 amConfig.ShardingRing.ReplicationFactor = 3 1453 amConfig.ShardingRing.InstanceID = instanceID 1454 amConfig.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) 1455 1456 // Do not check the ring topology changes or poll in an interval in this test (we explicitly sync alertmanagers). 1457 amConfig.PollInterval = time.Hour 1458 amConfig.ShardingRing.RingCheckPeriod = time.Hour 1459 amConfig.ShardingEnabled = true 1460 1461 reg := prometheus.NewPedanticRegistry() 1462 am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg) 1463 require.NoError(t, err) 1464 defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck 1465 1466 require.NoError(t, services.StartAndAwaitRunning(ctx, am)) 1467 1468 instances = append(instances, am) 1469 instanceIDs = append(instanceIDs, instanceID) 1470 registries.AddUserRegistry(instanceID, reg) 1471 } 1472 1473 // We need make sure the ring is settled. The alertmanager is ready to be tested once all instances are ACTIVE and the ring settles. 1474 ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 1475 defer cancel() 1476 1477 for _, am := range instances { 1478 for _, id := range instanceIDs { 1479 require.NoError(t, ring.WaitInstanceState(ctx, am.ring, id, ring.ACTIVE)) 1480 } 1481 } 1482 1483 // Now that the ring has settled, sync configs with the instances. 1484 for _, am := range instances { 1485 err := am.loadAndSyncConfigs(ctx, reasonRingChange) 1486 require.NoError(t, err) 1487 } 1488 1489 // Now that the ring has settled, we expect each AM instance to have a different position. 
1490 // Let's walk through them and collect the positions. 1491 var positions []int 1492 for _, instance := range instances { 1493 instance.alertmanagersMtx.Lock() 1494 am, ok := instance.alertmanagers["user-1"] 1495 require.True(t, ok) 1496 positions = append(positions, am.state.Position()) 1497 instance.alertmanagersMtx.Unlock() 1498 } 1499 1500 require.ElementsMatch(t, []int{0, 1, 2}, positions) 1501 } 1502 1503 func TestAlertmanager_StateReplicationWithSharding(t *testing.T) { 1504 tc := []struct { 1505 name string 1506 replicationFactor int 1507 instances int 1508 withSharding bool 1509 }{ 1510 { 1511 name: "sharding disabled (hence no replication factor), 1 instance", 1512 withSharding: false, 1513 instances: 1, 1514 replicationFactor: 0, 1515 }, 1516 { 1517 name: "sharding enabled, RF = 2, 2 instances", 1518 withSharding: true, 1519 instances: 2, 1520 replicationFactor: 2, 1521 }, 1522 { 1523 name: "sharding enabled, RF = 3, 10 instance", 1524 withSharding: true, 1525 instances: 10, 1526 replicationFactor: 3, 1527 }, 1528 } 1529 1530 for _, tt := range tc { 1531 t.Run(tt.name, func(t *testing.T) { 1532 ctx := context.Background() 1533 ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) 1534 t.Cleanup(func() { assert.NoError(t, closer.Close()) }) 1535 1536 mockStore := prepareInMemoryAlertStore() 1537 clientPool := newPassthroughAlertmanagerClientPool() 1538 externalURL := flagext.URLValue{} 1539 err := externalURL.Set("http://localhost:8080/alertmanager") 1540 require.NoError(t, err) 1541 1542 var instances []*MultitenantAlertmanager 1543 var instanceIDs []string 1544 registries := util.NewUserRegistries() 1545 1546 // First, add the number of configs to the store. 1547 for i := 1; i <= 12; i++ { 1548 u := fmt.Sprintf("u-%d", i) 1549 require.NoError(t, mockStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ 1550 User: u, 1551 RawConfig: simpleConfigOne, 1552 Templates: []*alertspb.TemplateDesc{}, 1553 })) 1554 } 1555 1556 // Then, create the alertmanager instances, start them and add their registries to the slice. 1557 for i := 1; i <= tt.instances; i++ { 1558 instanceIDs = append(instanceIDs, fmt.Sprintf("alertmanager-%d", i)) 1559 instanceID := fmt.Sprintf("alertmanager-%d", i) 1560 1561 amConfig := mockAlertmanagerConfig(t) 1562 amConfig.ExternalURL = externalURL 1563 amConfig.ShardingRing.ReplicationFactor = tt.replicationFactor 1564 amConfig.ShardingRing.InstanceID = instanceID 1565 amConfig.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) 1566 1567 // Do not check the ring topology changes or poll in an interval in this test (we explicitly sync alertmanagers). 
1568  amConfig.PollInterval = time.Hour
1569  amConfig.ShardingRing.RingCheckPeriod = time.Hour
1570
1571  if tt.withSharding {
1572  amConfig.ShardingEnabled = true
1573  }
1574
1575  reg := prometheus.NewPedanticRegistry()
1576  am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg)
1577  require.NoError(t, err)
1578  defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck
1579
1580  if tt.withSharding {
1581  clientPool.setServer(amConfig.ShardingRing.InstanceAddr+":0", am)
1582  am.alertmanagerClientsPool = clientPool
1583  }
1584
1585  require.NoError(t, services.StartAndAwaitRunning(ctx, am))
1586
1587  instances = append(instances, am)
1588  instanceIDs = append(instanceIDs, instanceID)
1589  registries.AddUserRegistry(instanceID, reg)
1590  }
1591
1592  // If we're testing with sharding, we need to make sure the ring is settled.
1593  if tt.withSharding {
1594  ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
1595  defer cancel()
1596
1597  // The alertmanager is ready to be tested once all instances are ACTIVE and the ring settles.
1598  for _, am := range instances {
1599  for _, id := range instanceIDs {
1600  require.NoError(t, ring.WaitInstanceState(ctx, am.ring, id, ring.ACTIVE))
1601  }
1602  }
1603  }
1604
1605  // Now that the ring has settled, sync configs with the instances.
1606  var numConfigs, numInstances int
1607  for _, am := range instances {
1608  err := am.loadAndSyncConfigs(ctx, reasonRingChange)
1609  require.NoError(t, err)
1610  numConfigs += len(am.cfgs)
1611  numInstances += len(am.alertmanagers)
1612  }
1613
1614  // With sharding enabled, we propagate messages over gRPC instead of using gossip over TCP.
1615  // 1. First, get a random multitenant instance.
1616  // We must pick an instance which actually has a user configured.
1617  var multitenantAM *MultitenantAlertmanager
1618  for {
1619  multitenantAM = instances[rand.Intn(len(instances))]
1620
1621  multitenantAM.alertmanagersMtx.Lock()
1622  amount := len(multitenantAM.alertmanagers)
1623  multitenantAM.alertmanagersMtx.Unlock()
1624  if amount > 0 {
1625  break
1626  }
1627  }
1628
1629  // 2. Then, get a random user that exists in that particular alertmanager instance.
1630  multitenantAM.alertmanagersMtx.Lock()
1631  require.Greater(t, len(multitenantAM.alertmanagers), 0)
1632  k := rand.Intn(len(multitenantAM.alertmanagers))
1633  var userID string
1634  for u := range multitenantAM.alertmanagers {
1635  if k == 0 {
1636  userID = u
1637  break
1638  }
1639  k--
1640  }
1641  multitenantAM.alertmanagersMtx.Unlock()
1642
1643  // 3. Now that we have our alertmanager user, let's create a silence and make sure it is replicated.
1644  silence := types.Silence{
1645  Matchers: labels.Matchers{
1646  {Name: "instance", Value: "prometheus-one"},
1647  },
1648  Comment: "Created for a test case.",
1649  StartsAt: time.Now(),
1650  EndsAt: time.Now().Add(time.Hour),
1651  }
1652  data, err := json.Marshal(silence)
1653  require.NoError(t, err)
1654
1655  // 4. Create the silence in one of the alertmanagers.
1656  req := httptest.NewRequest(http.MethodPost, externalURL.String()+"/api/v2/silences", bytes.NewReader(data))
1657  req.Header.Set("content-type", "application/json")
1658  reqCtx := user.InjectOrgID(req.Context(), userID)
1659  {
1660  w := httptest.NewRecorder()
1661  multitenantAM.serveRequest(w, req.WithContext(reqCtx))
1662
1663  resp := w.Result()
1664  body, _ := ioutil.ReadAll(resp.Body)
1665  assert.Equal(t, http.StatusOK, w.Code)
1666  require.Regexp(t, regexp.MustCompile(`{"silenceID":".+"}`), string(body))
1667  }
1668
1669  // If sharding is not enabled, we never propagate any messages amongst replicas in this way, and we can stop here.
1670  if !tt.withSharding {
1671  metrics := registries.BuildMetricFamiliesPerUser()
1672
1673  assert.Equal(t, float64(1), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
1674  assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
1675  assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
1676  return
1677  }
1678
1679  var metrics util.MetricFamiliesPerUser
1680
1681  // 5. Then, make sure it is propagated successfully.
1682  // Replication is asynchronous, so we may have to wait a short period of time.
1683  assert.Eventually(t, func() bool {
1684  metrics = registries.BuildMetricFamiliesPerUser()
1685  return (float64(tt.replicationFactor) == metrics.GetSumOfGauges("cortex_alertmanager_silences") &&
1686  float64(tt.replicationFactor) == metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
1687  }, 5*time.Second, 100*time.Millisecond)
1688
1689  assert.Equal(t, float64(tt.replicationFactor), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
1690  assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
1691
1692  // 5b. Check that the number of partial state merges is as we expect.
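// Each update fans out from the origin instance to its RF-1 peers, and each of those peers
// broadcasts the merged state once more to RF-1 peers, hence (RF-1) + (RF-1)^2 merges overall.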
1693  // Partial states are currently replicated twice:
1694  // For RF=1 1 -> 0      = Total 0 merges
1695  // For RF=2 1 -> 1 -> 1 = Total 2 merges
1696  // For RF=3 1 -> 2 -> 4 = Total 6 merges
1697  nFanOut := tt.replicationFactor - 1
1698  nMerges := nFanOut + (nFanOut * nFanOut)
1699
1700  assert.Eventually(t, func() bool {
1701  metrics = registries.BuildMetricFamiliesPerUser()
1702  return float64(nMerges) == metrics.GetSumOfCounters("cortex_alertmanager_partial_state_merges_total")
1703  }, 5*time.Second, 100*time.Millisecond)
1704
1705  assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_partial_state_merges_failed_total"))
1706  })
1707  }
1708  }
1709
1710  func TestAlertmanager_StateReplicationWithSharding_InitialSyncFromPeers(t *testing.T) {
1711  tc := []struct {
1712  name string
1713  replicationFactor int
1714  }{
1715  {
1716  name: "RF = 2",
1717  replicationFactor: 2,
1718  },
1719  {
1720  name: "RF = 3",
1721  replicationFactor: 3,
1722  },
1723  }
1724
1725  for _, tt := range tc {
1726  t.Run(tt.name, func(t *testing.T) {
1727  ctx := context.Background()
1728  ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil)
1729  t.Cleanup(func() { assert.NoError(t, closer.Close()) })
1730
1731  mockStore := prepareInMemoryAlertStore()
1732  clientPool := newPassthroughAlertmanagerClientPool()
1733  externalURL := flagext.URLValue{}
1734  err := externalURL.Set("http://localhost:8080/alertmanager")
1735  require.NoError(t, err)
1736
1737  var instances []*MultitenantAlertmanager
1738  var instanceIDs []string
1739  registries := util.NewUserRegistries()
1740
1741  // Create only two users - no need for more for these test cases.
1742  for i := 1; i <= 2; i++ {
1743  u := fmt.Sprintf("u-%d", i)
1744  require.NoError(t, mockStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{
1745  User: u,
1746  RawConfig: simpleConfigOne,
1747  Templates: []*alertspb.TemplateDesc{},
1748  }))
1749  }
1750
1751  createInstance := func(i int) *MultitenantAlertmanager {
1752
1753  instanceID := fmt.Sprintf("alertmanager-%d", i)
1754
1755  amConfig := mockAlertmanagerConfig(t)
1756  amConfig.ExternalURL = externalURL
1757  amConfig.ShardingRing.ReplicationFactor = tt.replicationFactor
1758  amConfig.ShardingRing.InstanceID = instanceID
1759  amConfig.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i)
1760
1761  // Do not check for ring topology changes or poll on an interval in this test (we explicitly sync the alertmanagers).
1762  amConfig.PollInterval = time.Hour
1763  amConfig.ShardingRing.RingCheckPeriod = time.Hour
1764
1765  amConfig.ShardingEnabled = true
1766
1767  reg := prometheus.NewPedanticRegistry()
1768  am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, nil, log.NewNopLogger(), reg)
1769  require.NoError(t, err)
1770
1771  clientPool.setServer(amConfig.ShardingRing.InstanceAddr+":0", am)
1772  am.alertmanagerClientsPool = clientPool
1773
1774  require.NoError(t, services.StartAndAwaitRunning(ctx, am))
1775  t.Cleanup(func() {
1776  require.NoError(t, services.StopAndAwaitTerminated(ctx, am))
1777  })
1778
1779  instances = append(instances, am)
1780  instanceIDs = append(instanceIDs, instanceID)
1781  registries.AddUserRegistry(instanceID, reg)
1782
1783  // Make sure the ring is settled.
1784  {
1785  ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
1786  defer cancel()
1787
1788  // The alertmanager is ready to be tested once all instances are ACTIVE and the ring settles.
1789  for _, am := range instances {
1790  for _, id := range instanceIDs {
1791  require.NoError(t, ring.WaitInstanceState(ctx, am.ring, id, ring.ACTIVE))
1792  }
1793  }
1794  }
1795
1796  // Now that the ring has settled, sync configs with the instances.
1797  require.NoError(t, am.loadAndSyncConfigs(ctx, reasonRingChange))
1798
1799  return am
1800  }
1801
1802  writeSilence := func(i *MultitenantAlertmanager, userID string) {
1803  silence := types.Silence{
1804  Matchers: labels.Matchers{
1805  {Name: "instance", Value: "prometheus-one"},
1806  },
1807  Comment: "Created for a test case.",
1808  StartsAt: time.Now(),
1809  EndsAt: time.Now().Add(time.Hour),
1810  }
1811  data, err := json.Marshal(silence)
1812  require.NoError(t, err)
1813
1814  req := httptest.NewRequest(http.MethodPost, externalURL.String()+"/api/v2/silences", bytes.NewReader(data))
1815  req.Header.Set("content-type", "application/json")
1816  reqCtx := user.InjectOrgID(req.Context(), userID)
1817  {
1818  w := httptest.NewRecorder()
1819  i.serveRequest(w, req.WithContext(reqCtx))
1820
1821  resp := w.Result()
1822  body, _ := ioutil.ReadAll(resp.Body)
1823  assert.Equal(t, http.StatusOK, w.Code)
1824  require.Regexp(t, regexp.MustCompile(`{"silenceID":".+"}`), string(body))
1825  }
1826  }
1827
1828  checkSilence := func(i *MultitenantAlertmanager, userID string) {
1829  req := httptest.NewRequest(http.MethodGet, externalURL.String()+"/api/v2/silences", nil)
1830  req.Header.Set("content-type", "application/json")
1831  reqCtx := user.InjectOrgID(req.Context(), userID)
1832  {
1833  w := httptest.NewRecorder()
1834  i.serveRequest(w, req.WithContext(reqCtx))
1835
1836  resp := w.Result()
1837  body, _ := ioutil.ReadAll(resp.Body)
1838  assert.Equal(t, http.StatusOK, w.Code)
1839  require.Regexp(t, regexp.MustCompile(`"comment":"Created for a test case."`), string(body))
1840  }
1841  }
1842
1843  // 1. Create the first instance and load the user configurations.
1844  i1 := createInstance(1)
1845
1846  // 2. Create a silence in the first alertmanager instance and check we can read it.
1847  writeSilence(i1, "u-1")
1848  // 2.a. Check the silence was created (paranoia).
1849  checkSilence(i1, "u-1")
1850  // 2.b. Check the relevant metrics were updated.
1851  {
1852  metrics := registries.BuildMetricFamiliesPerUser()
1853  assert.Equal(t, float64(1), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
1854  }
1855  // 2.c. Wait for the silence replication to be attempted; note this is asynchronous.
1856  {
1857  test.Poll(t, 5*time.Second, float64(1), func() interface{} {
1858  metrics := registries.BuildMetricFamiliesPerUser()
1859  return metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total")
1860  })
1861  metrics := registries.BuildMetricFamiliesPerUser()
1862  assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
1863  }
1864
1865  // 3. Create a second instance. This should attempt to fetch the silence from the first.
1866  i2 := createInstance(2)
1867
1868  // 3.a. Check the silence was fetched from the first instance successfully.
1869  checkSilence(i2, "u-1")
1870
1871  // 3.b. Check the metrics: We should see the additional silences without any replication activity.
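// (The second instance obtains the silence through the initial state sync from its peer when it
// starts up, so the replication counter should not grow beyond the original write.)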
1872  {
1873  metrics := registries.BuildMetricFamiliesPerUser()
1874  assert.Equal(t, float64(2), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
1875  assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
1876  assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
1877  }
1878
1879  if tt.replicationFactor >= 3 {
1880  // 4. When testing RF = 3, create a third instance, to test obtaining state from multiple places.
1881  i3 := createInstance(3)
1882
1883  // 4.a. Check the silence was fetched from one or both of the instances successfully.
1884  checkSilence(i3, "u-1")
1885
1886  // 4.b. Check the metrics one more time. We should have three replicas of the silence.
1887  {
1888  metrics := registries.BuildMetricFamiliesPerUser()
1889  assert.Equal(t, float64(3), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
1890  assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
1891  assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
1892  }
1893  }
1894  })
1895  }
1896  }
1897
1898  // prepareInMemoryAlertStore builds and returns an in-memory alert store.
1899  func prepareInMemoryAlertStore() alertstore.AlertStore {
1900  return bucketclient.NewBucketAlertStore(objstore.NewInMemBucket(), nil, log.NewNopLogger())
1901  }
1902
1903  func TestSafeTemplateFilepath(t *testing.T) {
1904  tests := map[string]struct {
1905  dir string
1906  template string
1907  expectedPath string
1908  expectedErr error
1909  }{
1910  "should succeed if the provided template is a filename": {
1911  dir: "/data/tenant",
1912  template: "test.tmpl",
1913  expectedPath: "/data/tenant/test.tmpl",
1914  },
1915  "should fail if the provided template is escaping the dir": {
1916  dir: "/data/tenant",
1917  template: "../test.tmpl",
1918  expectedErr: errors.New(`invalid template name "../test.tmpl": the template filepath is escaping the per-tenant local directory`),
1919  },
1920  }
1921
1922  for testName, testData := range tests {
1923  t.Run(testName, func(t *testing.T) {
1924  actualPath, actualErr := safeTemplateFilepath(testData.dir, testData.template)
1925  assert.Equal(t, testData.expectedErr, actualErr)
1926  assert.Equal(t, testData.expectedPath, actualPath)
1927  })
1928  }
1929  }
1930
1931  func TestStoreTemplateFile(t *testing.T) {
1932  tempDir, err := ioutil.TempDir(os.TempDir(), "alertmanager")
1933  require.NoError(t, err)
1934
1935  t.Cleanup(func() {
1936  require.NoError(t, os.RemoveAll(tempDir))
1937  })
1938
1939  testTemplateDir := filepath.Join(tempDir, templatesDir)
1940
1941  changed, err := storeTemplateFile(filepath.Join(testTemplateDir, "some-template"), "content")
1942  require.NoError(t, err)
1943  require.True(t, changed)
1944
1945  changed, err = storeTemplateFile(filepath.Join(testTemplateDir, "some-template"), "new content")
1946  require.NoError(t, err)
1947  require.True(t, changed)
1948
1949  changed, err = storeTemplateFile(filepath.Join(testTemplateDir, "some-template"), "new content") // reusing previous content
1950  require.NoError(t, err)
1951  require.False(t, changed)
1952  }
1953
1954  func TestMultitenantAlertmanager_verifyRateLimitedEmailConfig(t *testing.T) {
1955  ctx := context.Background()
1956
1957  config := `global:
1958    resolve_timeout: 1m
1959    smtp_require_tls: false
1960
1961  route:
1962    receiver: 'email'
1963
1964  receivers:
1965    - name: 'email'
1966      email_configs:
1967        - to: test@example.com
1968          from: test@example.com
1969            smarthost: smtp:2525
1970  `
1971
1972  // Run this test using a real storage client.
1973  store := prepareInMemoryAlertStore()
1974  require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{
1975  User: "user",
1976  RawConfig: config,
1977  Templates: []*alertspb.TemplateDesc{},
1978  }))
1979
1980  limits := mockAlertManagerLimits{
1981  emailNotificationRateLimit: 0,
1982  emailNotificationBurst: 0,
1983  }
1984
1985  reg := prometheus.NewPedanticRegistry()
1986  cfg := mockAlertmanagerConfig(t)
1987  am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, &limits, log.NewNopLogger(), reg)
1988  require.NoError(t, err)
1989
1990  err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic)
1991  require.NoError(t, err)
1992  require.Len(t, am.alertmanagers, 1)
1993
1994  am.alertmanagersMtx.Lock()
1995  uam := am.alertmanagers["user"]
1996  am.alertmanagersMtx.Unlock()
1997
1998  require.NotNil(t, uam)
1999
2000  ctx = notify.WithReceiverName(ctx, "email")
2001  ctx = notify.WithGroupKey(ctx, "key")
2002  ctx = notify.WithRepeatInterval(ctx, time.Minute)
2003
2004  // Verify that the rate limiter is in place for the email notifier.
2005  _, _, err = uam.lastPipeline.Exec(ctx, log.NewNopLogger(), &types.Alert{})
2006  require.NotNil(t, err)
2007  require.Contains(t, err.Error(), errRateLimited.Error())
2008  }
2009
2010  type passthroughAlertmanagerClient struct {
2011  server alertmanagerpb.AlertmanagerServer
2012  }
2013
2014  func (am *passthroughAlertmanagerClient) UpdateState(ctx context.Context, in *clusterpb.Part, opts ...grpc.CallOption) (*alertmanagerpb.UpdateStateResponse, error) {
2015  return am.server.UpdateState(ctx, in)
2016  }
2017
2018  func (am *passthroughAlertmanagerClient) ReadState(ctx context.Context, in *alertmanagerpb.ReadStateRequest, opts ...grpc.CallOption) (*alertmanagerpb.ReadStateResponse, error) {
2019  return am.server.ReadState(ctx, in)
2020  }
2021
2022  func (am *passthroughAlertmanagerClient) HandleRequest(context.Context, *httpgrpc.HTTPRequest, ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) {
2023  return nil, fmt.Errorf("unexpected call to HandleRequest")
2024  }
2025
2026  func (am *passthroughAlertmanagerClient) RemoteAddress() string {
2027  return ""
2028  }
2029
2030  // passthroughAlertmanagerClientPool allows testing the logic of gRPC calls between alertmanager instances
2031  // by invoking client calls directly to a peer instance in the unit test, without the server running.
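// For example, a minimal sketch of how the tests above wire the pool into an instance:
//
//	pool := newPassthroughAlertmanagerClientPool()
//	pool.setServer("127.0.0.1:0", am)  // register a peer's AlertmanagerServer under its ring address
//	am.alertmanagerClientsPool = pool  // the instance now reaches that peer in-process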
2032  type passthroughAlertmanagerClientPool struct {
2033  serversMtx sync.Mutex
2034  servers map[string]alertmanagerpb.AlertmanagerServer
2035  }
2036
2037  func newPassthroughAlertmanagerClientPool() *passthroughAlertmanagerClientPool {
2038  return &passthroughAlertmanagerClientPool{
2039  servers: make(map[string]alertmanagerpb.AlertmanagerServer),
2040  }
2041  }
2042
2043  func (f *passthroughAlertmanagerClientPool) setServer(addr string, server alertmanagerpb.AlertmanagerServer) {
2044  f.serversMtx.Lock()
2045  defer f.serversMtx.Unlock()
2046  f.servers[addr] = server
2047  }
2048
2049  func (f *passthroughAlertmanagerClientPool) GetClientFor(addr string) (Client, error) {
2050  f.serversMtx.Lock()
2051  defer f.serversMtx.Unlock()
2052  s, ok := f.servers[addr]
2053  if !ok {
2054  return nil, fmt.Errorf("client not found for address: %v", addr)
2055  }
2056  return Client(&passthroughAlertmanagerClient{s}), nil
2057  }
2058
2059  type mockAlertManagerLimits struct {
2060  emailNotificationRateLimit rate.Limit
2061  emailNotificationBurst int
2062  maxConfigSize int
2063  maxTemplatesCount int
2064  maxSizeOfTemplate int
2065  maxDispatcherAggregationGroups int
2066  maxAlertsCount int
2067  maxAlertsSizeBytes int
2068  }
2069
2070  func (m *mockAlertManagerLimits) AlertmanagerMaxConfigSize(tenant string) int {
2071  return m.maxConfigSize
2072  }
2073
2074  func (m *mockAlertManagerLimits) AlertmanagerMaxTemplatesCount(tenant string) int {
2075  return m.maxTemplatesCount
2076  }
2077
2078  func (m *mockAlertManagerLimits) AlertmanagerMaxTemplateSize(tenant string) int {
2079  return m.maxSizeOfTemplate
2080  }
2081
2082  func (m *mockAlertManagerLimits) AlertmanagerReceiversBlockCIDRNetworks(user string) []flagext.CIDR {
2083  panic("implement me")
2084  }
2085
2086  func (m *mockAlertManagerLimits) AlertmanagerReceiversBlockPrivateAddresses(user string) bool {
2087  panic("implement me")
2088  }
2089
2090  func (m *mockAlertManagerLimits) NotificationRateLimit(_ string, integration string) rate.Limit {
2091  return m.emailNotificationRateLimit
2092  }
2093
2094  func (m *mockAlertManagerLimits) NotificationBurstSize(_ string, integration string) int {
2095  return m.emailNotificationBurst
2096  }
2097
2098  func (m *mockAlertManagerLimits) AlertmanagerMaxDispatcherAggregationGroups(_ string) int {
2099  return m.maxDispatcherAggregationGroups
2100  }
2101
2102  func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsCount(_ string) int {
2103  return m.maxAlertsCount
2104  }
2105
2106  func (m *mockAlertManagerLimits) AlertmanagerMaxAlertsSizeBytes(_ string) int {
2107  return m.maxAlertsSizeBytes
2108  }