github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/state_persister.go (about)

     1  package alertmanager
     2  
     3  import (
     4  	"context"
     5  	"flag"
     6  	"time"
     7  
     8  	"github.com/go-kit/log"
     9  	"github.com/go-kit/log/level"
    10  	"github.com/grafana/dskit/services"
    11  	"github.com/pkg/errors"
    12  	"github.com/prometheus/alertmanager/cluster/clusterpb"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"github.com/prometheus/client_golang/prometheus/promauto"
    15  
    16  	"github.com/cortexproject/cortex/pkg/alertmanager/alertspb"
    17  	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
    18  )
    19  
    20  const (
    21  	defaultPersistTimeout = 30 * time.Second
    22  )
    23  
    24  var (
    25  	errInvalidPersistInterval = errors.New("invalid alertmanager persist interval, must be greater than zero")
    26  )
    27  
    28  type PersisterConfig struct {
    29  	Interval time.Duration `yaml:"persist_interval"`
    30  }
    31  
    32  func (cfg *PersisterConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
    33  	f.DurationVar(&cfg.Interval, prefix+".persist-interval", 15*time.Minute, "The interval between persisting the current alertmanager state (notification log and silences) to object storage. This is only used when sharding is enabled. This state is read when all replicas for a shard can not be contacted. In this scenario, having persisted the state more frequently will result in potentially fewer lost silences, and fewer duplicate notifications.")
    34  }
    35  
    36  func (cfg *PersisterConfig) Validate() error {
    37  	if cfg.Interval <= 0 {
    38  		return errInvalidPersistInterval
    39  	}
    40  	return nil
    41  }
    42  
    43  type PersistableState interface {
    44  	State
    45  	GetFullState() (*clusterpb.FullState, error)
    46  }
    47  
    48  // statePersister periodically writes the alertmanager state to persistent storage.
    49  type statePersister struct {
    50  	services.Service
    51  
    52  	state  PersistableState
    53  	store  alertstore.AlertStore
    54  	userID string
    55  	logger log.Logger
    56  
    57  	timeout time.Duration
    58  
    59  	persistTotal  prometheus.Counter
    60  	persistFailed prometheus.Counter
    61  }
    62  
    63  // newStatePersister creates a new state persister.
    64  func newStatePersister(cfg PersisterConfig, userID string, state PersistableState, store alertstore.AlertStore, l log.Logger, r prometheus.Registerer) *statePersister {
    65  
    66  	s := &statePersister{
    67  		state:   state,
    68  		store:   store,
    69  		userID:  userID,
    70  		logger:  l,
    71  		timeout: defaultPersistTimeout,
    72  		persistTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
    73  			Name: "alertmanager_state_persist_total",
    74  			Help: "Number of times we have tried to persist the running state to remote storage.",
    75  		}),
    76  		persistFailed: promauto.With(r).NewCounter(prometheus.CounterOpts{
    77  			Name: "alertmanager_state_persist_failed_total",
    78  			Help: "Number of times we have failed to persist the running state to remote storage.",
    79  		}),
    80  	}
    81  
    82  	s.Service = services.NewTimerService(cfg.Interval, s.starting, s.iteration, nil)
    83  
    84  	return s
    85  }
    86  
    87  func (s *statePersister) starting(ctx context.Context) error {
    88  	// Waits until the state replicator is settled, so that state is not
    89  	// persisted before obtaining some initial state.
    90  	return s.state.WaitReady(ctx)
    91  }
    92  
    93  func (s *statePersister) iteration(ctx context.Context) error {
    94  	if err := s.persist(ctx); err != nil {
    95  		level.Error(s.logger).Log("msg", "failed to persist state", "user", s.userID, "err", err)
    96  	}
    97  	return nil
    98  }
    99  
   100  func (s *statePersister) persist(ctx context.Context) (err error) {
   101  	// Only the replica at position zero should write the state.
   102  	if s.state.Position() != 0 {
   103  		return nil
   104  	}
   105  
   106  	s.persistTotal.Inc()
   107  	defer func() {
   108  		if err != nil {
   109  			s.persistFailed.Inc()
   110  		}
   111  	}()
   112  
   113  	level.Debug(s.logger).Log("msg", "persisting state", "user", s.userID)
   114  
   115  	var fs *clusterpb.FullState
   116  	fs, err = s.state.GetFullState()
   117  	if err != nil {
   118  		return err
   119  	}
   120  
   121  	ctx, cancel := context.WithTimeout(ctx, s.timeout)
   122  	defer cancel()
   123  
   124  	desc := alertspb.FullStateDesc{State: fs}
   125  	if err = s.store.SetFullState(ctx, s.userID, desc); err != nil {
   126  		return err
   127  	}
   128  
   129  	return nil
   130  }