github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/state_persister.go (about) 1 package alertmanager 2 3 import ( 4 "context" 5 "flag" 6 "time" 7 8 "github.com/go-kit/log" 9 "github.com/go-kit/log/level" 10 "github.com/grafana/dskit/services" 11 "github.com/pkg/errors" 12 "github.com/prometheus/alertmanager/cluster/clusterpb" 13 "github.com/prometheus/client_golang/prometheus" 14 "github.com/prometheus/client_golang/prometheus/promauto" 15 16 "github.com/cortexproject/cortex/pkg/alertmanager/alertspb" 17 "github.com/cortexproject/cortex/pkg/alertmanager/alertstore" 18 ) 19 20 const ( 21 defaultPersistTimeout = 30 * time.Second 22 ) 23 24 var ( 25 errInvalidPersistInterval = errors.New("invalid alertmanager persist interval, must be greater than zero") 26 ) 27 28 type PersisterConfig struct { 29 Interval time.Duration `yaml:"persist_interval"` 30 } 31 32 func (cfg *PersisterConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { 33 f.DurationVar(&cfg.Interval, prefix+".persist-interval", 15*time.Minute, "The interval between persisting the current alertmanager state (notification log and silences) to object storage. This is only used when sharding is enabled. This state is read when all replicas for a shard can not be contacted. In this scenario, having persisted the state more frequently will result in potentially fewer lost silences, and fewer duplicate notifications.") 34 } 35 36 func (cfg *PersisterConfig) Validate() error { 37 if cfg.Interval <= 0 { 38 return errInvalidPersistInterval 39 } 40 return nil 41 } 42 43 type PersistableState interface { 44 State 45 GetFullState() (*clusterpb.FullState, error) 46 } 47 48 // statePersister periodically writes the alertmanager state to persistent storage. 49 type statePersister struct { 50 services.Service 51 52 state PersistableState 53 store alertstore.AlertStore 54 userID string 55 logger log.Logger 56 57 timeout time.Duration 58 59 persistTotal prometheus.Counter 60 persistFailed prometheus.Counter 61 } 62 63 // newStatePersister creates a new state persister. 64 func newStatePersister(cfg PersisterConfig, userID string, state PersistableState, store alertstore.AlertStore, l log.Logger, r prometheus.Registerer) *statePersister { 65 66 s := &statePersister{ 67 state: state, 68 store: store, 69 userID: userID, 70 logger: l, 71 timeout: defaultPersistTimeout, 72 persistTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 73 Name: "alertmanager_state_persist_total", 74 Help: "Number of times we have tried to persist the running state to remote storage.", 75 }), 76 persistFailed: promauto.With(r).NewCounter(prometheus.CounterOpts{ 77 Name: "alertmanager_state_persist_failed_total", 78 Help: "Number of times we have failed to persist the running state to remote storage.", 79 }), 80 } 81 82 s.Service = services.NewTimerService(cfg.Interval, s.starting, s.iteration, nil) 83 84 return s 85 } 86 87 func (s *statePersister) starting(ctx context.Context) error { 88 // Waits until the state replicator is settled, so that state is not 89 // persisted before obtaining some initial state. 90 return s.state.WaitReady(ctx) 91 } 92 93 func (s *statePersister) iteration(ctx context.Context) error { 94 if err := s.persist(ctx); err != nil { 95 level.Error(s.logger).Log("msg", "failed to persist state", "user", s.userID, "err", err) 96 } 97 return nil 98 } 99 100 func (s *statePersister) persist(ctx context.Context) (err error) { 101 // Only the replica at position zero should write the state. 102 if s.state.Position() != 0 { 103 return nil 104 } 105 106 s.persistTotal.Inc() 107 defer func() { 108 if err != nil { 109 s.persistFailed.Inc() 110 } 111 }() 112 113 level.Debug(s.logger).Log("msg", "persisting state", "user", s.userID) 114 115 var fs *clusterpb.FullState 116 fs, err = s.state.GetFullState() 117 if err != nil { 118 return err 119 } 120 121 ctx, cancel := context.WithTimeout(ctx, s.timeout) 122 defer cancel() 123 124 desc := alertspb.FullStateDesc{State: fs} 125 if err = s.store.SetFullState(ctx, s.userID, desc); err != nil { 126 return err 127 } 128 129 return nil 130 }