github.com/thanos-io/thanos@v0.32.5/pkg/alert/alert.go

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  // Package alert contains logic to send alert notifications to Alertmanager clusters.
     5  package alert
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"encoding/json"
    11  	"fmt"
    12  	"io"
    13  	"net/http"
    14  	"net/url"
    15  	"path"
    16  	"sync"
    17  	"time"
    18  
    19  	"github.com/go-kit/log"
    20  	"github.com/go-kit/log/level"
    21  	"github.com/go-openapi/strfmt"
    22  	"github.com/pkg/errors"
    23  	"github.com/prometheus/alertmanager/api/v2/models"
    24  	"github.com/prometheus/client_golang/prometheus"
    25  	"github.com/prometheus/client_golang/prometheus/promauto"
    26  	"github.com/prometheus/prometheus/model/labels"
    27  	"github.com/prometheus/prometheus/model/relabel"
    28  	"github.com/prometheus/prometheus/notifier"
    29  	"go.uber.org/atomic"
    30  
    31  	"github.com/thanos-io/thanos/pkg/runutil"
    32  	"github.com/thanos-io/thanos/pkg/tracing"
    33  )
    34  
    35  const (
    36  	defaultAlertmanagerPort = 9093
    37  	contentTypeJSON         = "application/json"
    38  )
    39  
    40  // Queue is a queue of alert notifications waiting to be sent. The queue is consumed in batches
    41  // and entries are dropped at the front if it runs full.
    42  type Queue struct {
    43  	logger              log.Logger
    44  	maxBatchSize        int
    45  	capacity            int
    46  	toAddLset           labels.Labels
    47  	toExcludeLabels     labels.Labels
    48  	alertRelabelConfigs []*relabel.Config
    49  
    50  	mtx   sync.Mutex
    51  	queue []*notifier.Alert
    52  	morec chan struct{}
    53  
    54  	pushed  prometheus.Counter
    55  	popped  prometheus.Counter
    56  	dropped prometheus.Counter
    57  }
    58  
    59  func relabelLabels(lset labels.Labels, excludeLset []string) (toAdd, toExclude labels.Labels) {
    60  	for _, ln := range excludeLset {
    61  		toExclude = append(toExclude, labels.Label{Name: ln})
    62  	}
    63  
    64  	for _, l := range lset {
     65  		// Drop excluded labels from the labels to add straight away.
    66  		if toExclude.Has(l.Name) {
    67  			continue
    68  		}
    69  		toAdd = append(toAdd, labels.Label{
    70  			Name:  l.Name,
    71  			Value: l.Value,
    72  		})
    73  	}
    74  	return toAdd, toExclude
    75  }
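
// Worked example (illustrative): with lset = {region="eu", replica="rule-0"}
// and excludeLset = []string{"replica"}, relabelLabels returns
//
//	toAdd:     {region="eu"}
//	toExclude: {replica=""}
//
// so the excluded name is neither attached to alerts nor kept when it appears
// on an alert's own label set (see Queue.Push).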
    76  
    77  // NewQueue returns a new queue. The given label set is attached to all alerts pushed to the queue.
     78  // The given exclude label set tells which label names to drop, including external labels.
    79  func NewQueue(logger log.Logger, reg prometheus.Registerer, capacity, maxBatchSize int, externalLset labels.Labels, excludeLabels []string, alertRelabelConfigs []*relabel.Config) *Queue {
    80  	toAdd, toExclude := relabelLabels(externalLset, excludeLabels)
    81  
    82  	if logger == nil {
    83  		logger = log.NewNopLogger()
    84  	}
    85  	q := &Queue{
    86  		logger:              logger,
    87  		capacity:            capacity,
    88  		morec:               make(chan struct{}, 1),
    89  		maxBatchSize:        maxBatchSize,
    90  		toAddLset:           toAdd,
    91  		toExcludeLabels:     toExclude,
    92  		alertRelabelConfigs: alertRelabelConfigs,
    93  
    94  		dropped: promauto.With(reg).NewCounter(prometheus.CounterOpts{
    95  			Name: "thanos_alert_queue_alerts_dropped_total",
    96  			Help: "Total number of alerts that were dropped from the queue.",
    97  		}),
    98  		pushed: promauto.With(reg).NewCounter(prometheus.CounterOpts{
    99  			Name: "thanos_alert_queue_alerts_pushed_total",
   100  			Help: "Total number of alerts pushed to the queue.",
   101  		}),
   102  		popped: promauto.With(reg).NewCounter(prometheus.CounterOpts{
   103  			Name: "thanos_alert_queue_alerts_popped_total",
   104  			Help: "Total number of alerts popped from the queue.",
   105  		}),
   106  	}
   107  	_ = promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
   108  		Name: "thanos_alert_queue_capacity",
   109  		Help: "Capacity of the alert queue.",
   110  	}, func() float64 {
   111  		return float64(q.Cap())
   112  	})
   113  	_ = promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
   114  		Name: "thanos_alert_queue_length",
   115  		Help: "Length of the alert queue.",
   116  	}, func() float64 {
   117  		return float64(q.Len())
   118  	})
   119  	return q
   120  }
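
// Construction sketch (all values are illustrative):
//
//	q := NewQueue(
//		log.NewNopLogger(),
//		prometheus.NewRegistry(),
//		10000,                                   // capacity
//		100,                                     // maxBatchSize
//		labels.FromStrings("replica", "rule-0"), // external labels to attach
//		[]string{"replica"},                     // label names to exclude
//		nil,                                     // no alert relabel configs
//	)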
   121  
   122  // Len returns the current length of the queue.
   123  func (q *Queue) Len() int {
   124  	q.mtx.Lock()
   125  	defer q.mtx.Unlock()
   126  	return len(q.queue)
   127  }
   128  
   129  // Cap returns the fixed capacity of the queue.
   130  func (q *Queue) Cap() int {
   131  	return q.capacity
   132  }
   133  
   134  // Pop takes a batch of alerts from the front of the queue. The batch size is limited
    135  // according to the queue's maxBatchSize limit.
    136  // It blocks until elements are available or a termination signal is sent on termc.
   137  func (q *Queue) Pop(termc <-chan struct{}) []*notifier.Alert {
   138  	select {
   139  	case <-termc:
   140  		return nil
   141  	case <-q.morec:
   142  	}
   143  
   144  	q.mtx.Lock()
   145  	defer q.mtx.Unlock()
   146  
   147  	as := make([]*notifier.Alert, q.maxBatchSize)
   148  	n := copy(as, q.queue)
   149  	q.queue = q.queue[n:]
   150  
   151  	q.popped.Add(float64(n))
   152  
   153  	if len(q.queue) > 0 {
   154  		select {
   155  		case q.morec <- struct{}{}:
   156  		default:
   157  		}
   158  	}
   159  	return as[:n]
   160  }
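
// Typical consumer loop (sketch, assuming q is a *Queue and send handles one
// batch): Pop returns nil once the termination channel fires, which ends the loop.
//
//	for {
//		batch := q.Pop(ctx.Done())
//		if batch == nil {
//			return
//		}
//		send(ctx, batch)
//	}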
   161  
   162  // Push adds a list of alerts to the queue.
   163  func (q *Queue) Push(alerts []*notifier.Alert) {
   164  	if len(alerts) == 0 {
   165  		return
   166  	}
   167  
   168  	q.mtx.Lock()
   169  	defer q.mtx.Unlock()
   170  
   171  	q.pushed.Add(float64(len(alerts)))
   172  
   173  	// Attach external labels, drop excluded labels and process relabeling before sending.
   174  	var relabeledAlerts []*notifier.Alert
   175  	for _, a := range alerts {
   176  		lb := labels.NewBuilder(labels.Labels{})
   177  		for _, l := range a.Labels {
   178  			if q.toExcludeLabels.Has(l.Name) {
   179  				continue
   180  			}
   181  			lb.Set(l.Name, l.Value)
   182  		}
   183  		for _, l := range q.toAddLset {
   184  			lb.Set(l.Name, l.Value)
   185  		}
   186  
   187  		if lset, keep := relabel.Process(lb.Labels(), q.alertRelabelConfigs...); keep {
   188  			a.Labels = lset
   189  			relabeledAlerts = append(relabeledAlerts, a)
   190  		}
   191  	}
   192  
   193  	alerts = relabeledAlerts
   194  	if len(alerts) == 0 {
   195  		return
   196  	}
   197  	// Queue capacity should be significantly larger than a single alert
   198  	// batch could be.
   199  	if d := len(alerts) - q.capacity; d > 0 {
   200  		alerts = alerts[d:]
   201  
   202  		level.Warn(q.logger).Log(
   203  			"msg", "Alert batch larger than queue capacity, dropping alerts",
   204  			"numDropped", d)
   205  		q.dropped.Add(float64(d))
   206  	}
   207  
   208  	// If the queue is full, remove the oldest alerts in favor
   209  	// of newer ones.
   210  	if d := (len(q.queue) + len(alerts)) - q.capacity; d > 0 {
   211  		q.queue = q.queue[d:]
   212  
   213  		level.Warn(q.logger).Log(
   214  			"msg", "Alert notification queue full, dropping alerts",
   215  			"numDropped", d)
   216  		q.dropped.Add(float64(d))
   217  	}
   218  
   219  	q.queue = append(q.queue, alerts...)
   220  
   221  	select {
   222  	case q.morec <- struct{}{}:
   223  	default:
   224  	}
   225  }
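
// Producer sketch, assuming q was built with external labels
// {region="eu", replica="rule-0"} and exclude label "replica": the alert below
// is queued with labels {alertname="HighLatency", region="eu"}; its own
// "replica" label is dropped and the excluded external label is never attached.
//
//	q.Push([]*notifier.Alert{{
//		Labels:       labels.FromStrings("alertname", "HighLatency", "replica", "0"),
//		Annotations:  labels.FromStrings("summary", "p99 latency above target"),
//		StartsAt:     time.Now(),
//		GeneratorURL: "http://rule-0:10902/graph",
//	}})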
   226  
   227  // Sender sends notifications to a dynamic set of alertmanagers.
   228  type Sender struct {
   229  	logger        log.Logger
   230  	alertmanagers []*Alertmanager
   231  	versions      []APIVersion
   232  
   233  	sent    *prometheus.CounterVec
   234  	errs    *prometheus.CounterVec
   235  	dropped prometheus.Counter
   236  	latency *prometheus.HistogramVec
   237  }
   238  
   239  // NewSender returns a new sender. On each call to Send the entire alert batch is sent
    240  // to every Alertmanager in the given list.
   241  func NewSender(
   242  	logger log.Logger,
   243  	reg prometheus.Registerer,
   244  	alertmanagers []*Alertmanager,
   245  ) *Sender {
   246  	if logger == nil {
   247  		logger = log.NewNopLogger()
   248  	}
    249  	// Collect each distinct API version used by the configured Alertmanagers.
    250  	var versions []APIVersion
    251  	versionPresent := map[APIVersion]struct{}{}
    252  	for _, am := range alertmanagers {
    253  		if _, found := versionPresent[am.version]; found {
    254  			continue
    255  		}
    256  		versionPresent[am.version] = struct{}{}
    257  		versions = append(versions, am.version)
    258  	}
   259  	s := &Sender{
   260  		logger:        logger,
   261  		alertmanagers: alertmanagers,
   262  		versions:      versions,
   263  
   264  		sent: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   265  			Name: "thanos_alert_sender_alerts_sent_total",
   266  			Help: "Total number of alerts sent by alertmanager.",
   267  		}, []string{"alertmanager"}),
   268  
   269  		errs: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   270  			Name: "thanos_alert_sender_errors_total",
   271  			Help: "Total number of errors while sending alerts to alertmanager.",
   272  		}, []string{"alertmanager"}),
   273  
   274  		dropped: promauto.With(reg).NewCounter(prometheus.CounterOpts{
   275  			Name: "thanos_alert_sender_alerts_dropped_total",
    276  			Help: "Total number of alerts dropped because all sends to alertmanagers failed.",
   277  		}),
   278  
   279  		latency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   280  			Name: "thanos_alert_sender_latency_seconds",
   281  			Help: "Latency for sending alert notifications (not including dropped notifications).",
   282  		}, []string{"alertmanager"}),
   283  	}
   284  	return s
   285  }
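
// Construction sketch, assuming amV1 and amV2 are *Alertmanager clients built
// with NewAlertmanager for APIv1 and APIv2: Send encodes the alert payload once
// per distinct API version collected here.
//
//	s := NewSender(logger, reg, []*Alertmanager{amV1, amV2})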
   286  
   287  func toAPILabels(labels labels.Labels) models.LabelSet {
   288  	apiLabels := make(models.LabelSet, len(labels))
   289  	for _, label := range labels {
   290  		apiLabels[label.Name] = label.Value
   291  	}
   292  
   293  	return apiLabels
   294  }
   295  
   296  // Send an alert batch to all given Alertmanager clients.
   297  // TODO(bwplotka): https://github.com/thanos-io/thanos/issues/660.
   298  func (s *Sender) Send(ctx context.Context, alerts []*notifier.Alert) {
   299  	if len(alerts) == 0 {
   300  		return
   301  	}
   302  
   303  	payload := make(map[APIVersion][]byte)
   304  	for _, version := range s.versions {
   305  		var (
   306  			b   []byte
   307  			err error
   308  		)
   309  		switch version {
   310  		case APIv1:
   311  			if b, err = json.Marshal(alerts); err != nil {
   312  				level.Warn(s.logger).Log("msg", "encoding alerts for v1 API failed", "err", err)
   313  				return
   314  			}
   315  		case APIv2:
   316  			apiAlerts := make(models.PostableAlerts, 0, len(alerts))
   317  			for _, a := range alerts {
   318  				apiAlerts = append(apiAlerts, &models.PostableAlert{
   319  					Annotations: toAPILabels(a.Annotations),
   320  					EndsAt:      strfmt.DateTime(a.EndsAt),
   321  					StartsAt:    strfmt.DateTime(a.StartsAt),
   322  					Alert: models.Alert{
   323  						GeneratorURL: strfmt.URI(a.GeneratorURL),
   324  						Labels:       toAPILabels(a.Labels),
   325  					},
   326  				})
   327  			}
   328  			if b, err = json.Marshal(apiAlerts); err != nil {
   329  				level.Warn(s.logger).Log("msg", "encoding alerts for v2 API failed", "err", err)
   330  				return
   331  			}
   332  		}
   333  		payload[version] = b
   334  	}
   335  
   336  	var (
   337  		wg         sync.WaitGroup
   338  		numSuccess atomic.Uint64
   339  	)
   340  	for _, am := range s.alertmanagers {
   341  		for _, u := range am.dispatcher.Endpoints() {
   342  			wg.Add(1)
   343  			go func(am *Alertmanager, u url.URL) {
   344  				defer wg.Done()
   345  
   346  				level.Debug(s.logger).Log("msg", "sending alerts", "alertmanager", u.Host, "numAlerts", len(alerts))
   347  				start := time.Now()
   348  				u.Path = path.Join(u.Path, fmt.Sprintf("/api/%s/alerts", string(am.version)))
   349  
   350  				tracing.DoInSpan(ctx, "post_alerts HTTP[client]", func(ctx context.Context) {
   351  					if err := am.postAlerts(ctx, u, bytes.NewReader(payload[am.version])); err != nil {
   352  						level.Warn(s.logger).Log(
   353  							"msg", "sending alerts failed",
   354  							"alertmanager", u.Host,
   355  							"alerts", string(payload[am.version]),
   356  							"err", err,
   357  						)
   358  						s.errs.WithLabelValues(u.Host).Inc()
   359  						return
   360  					}
   361  					s.latency.WithLabelValues(u.Host).Observe(time.Since(start).Seconds())
   362  					s.sent.WithLabelValues(u.Host).Add(float64(len(alerts)))
   363  
   364  					numSuccess.Inc()
   365  				})
   366  			}(am, *u)
   367  		}
   368  	}
   369  	wg.Wait()
   370  
   371  	if numSuccess.Load() > 0 {
   372  		return
   373  	}
   374  
   375  	s.dropped.Add(float64(len(alerts)))
   376  	level.Warn(s.logger).Log("msg", "failed to send alerts to all alertmanagers", "numAlerts", len(alerts))
   377  }
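
// Usage sketch, assuming s is a *Sender wired to at least one Alertmanager:
//
//	s.Send(context.Background(), []*notifier.Alert{{
//		Labels:   labels.FromStrings("alertname", "DiskFull", "severity", "critical"),
//		StartsAt: time.Now(),
//	}})
//
// Alerts are counted as dropped only when every Alertmanager endpoint fails.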
   378  
   379  type Dispatcher interface {
   380  	// Endpoints returns the list of endpoint URLs the dispatcher knows about.
   381  	Endpoints() []*url.URL
   382  	// Do sends an HTTP request and returns a response.
   383  	Do(*http.Request) (*http.Response, error)
   384  }
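
// Minimal Dispatcher sketch (hypothetical, not part of this package): a plain
// http.Client with a fixed endpoint list. The dispatcher Thanos typically wires
// in for the ruler additionally layers endpoint discovery and auth on top of this.
//
//	type staticDispatcher struct {
//		client    *http.Client
//		endpoints []*url.URL
//	}
//
//	func (d staticDispatcher) Endpoints() []*url.URL { return d.endpoints }
//
//	func (d staticDispatcher) Do(req *http.Request) (*http.Response, error) {
//		return d.client.Do(req)
//	}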
   385  
   386  // Alertmanager is an HTTP client that can send alerts to a cluster of Alertmanager endpoints.
   387  type Alertmanager struct {
   388  	logger     log.Logger
   389  	dispatcher Dispatcher
   390  	timeout    time.Duration
   391  	version    APIVersion
   392  }
   393  
   394  // NewAlertmanager returns a new Alertmanager client.
   395  func NewAlertmanager(logger log.Logger, dispatcher Dispatcher, timeout time.Duration, version APIVersion) *Alertmanager {
   396  	if logger == nil {
   397  		logger = log.NewNopLogger()
   398  	}
   399  
   400  	return &Alertmanager{
   401  		logger:     logger,
   402  		dispatcher: dispatcher,
   403  		timeout:    timeout,
   404  		version:    version,
   405  	}
   406  }
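
// Wiring sketch, using the hypothetical staticDispatcher from the Dispatcher
// example above:
//
//	u, _ := url.Parse("http://alertmanager.example.org:9093")
//	am := NewAlertmanager(
//		logger,
//		staticDispatcher{client: http.DefaultClient, endpoints: []*url.URL{u}},
//		10*time.Second, // per-request timeout enforced in postAlerts
//		APIv2,
//	)
//	sender := NewSender(logger, reg, []*Alertmanager{am})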
   407  
   408  func (a *Alertmanager) postAlerts(ctx context.Context, u url.URL, r io.Reader) error {
   409  	req, err := http.NewRequest("POST", u.String(), r)
   410  	if err != nil {
   411  		return err
   412  	}
   413  	ctx, cancel := context.WithTimeout(ctx, a.timeout)
   414  	defer cancel()
   415  	req = req.WithContext(ctx)
   416  	req.Header.Set("Content-Type", contentTypeJSON)
   417  
   418  	resp, err := a.dispatcher.Do(req)
   419  	if err != nil {
   420  		return errors.Wrapf(err, "send request to %q", u.String())
   421  	}
   422  	defer runutil.ExhaustCloseWithLogOnErr(a.logger, resp.Body, "send one alert")
   423  
   424  	if resp.StatusCode/100 != 2 {
   425  		return errors.Errorf("bad response status %v from %q", resp.Status, u.String())
   426  	}
   427  	return nil
   428  }
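
// For reference, the request Sender.Send issues through postAlerts against a
// v2 endpoint looks roughly like this (host and body are illustrative):
//
//	POST http://alertmanager.example.org:9093/api/v2/alerts
//	Content-Type: application/json
//
//	[{"labels":{"alertname":"DiskFull","severity":"critical"},"startsAt":"...","endsAt":"..."}]
//
// Any non-2xx status is surfaced to the caller as an error, and the response
// body is always drained and closed so the underlying connection can be reused.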