github.com/thanos-io/thanos@v0.32.5/pkg/alert/alert.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

// Package alert contains logic to send alert notifications to Alertmanager clusters.
package alert

import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "path"
    "sync"
    "time"

    "github.com/go-kit/log"
    "github.com/go-kit/log/level"
    "github.com/go-openapi/strfmt"
    "github.com/pkg/errors"
    "github.com/prometheus/alertmanager/api/v2/models"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "github.com/prometheus/prometheus/model/labels"
    "github.com/prometheus/prometheus/model/relabel"
    "github.com/prometheus/prometheus/notifier"
    "go.uber.org/atomic"

    "github.com/thanos-io/thanos/pkg/runutil"
    "github.com/thanos-io/thanos/pkg/tracing"
)

const (
    defaultAlertmanagerPort = 9093
    contentTypeJSON         = "application/json"
)

// Queue is a queue of alert notifications waiting to be sent. The queue is consumed in batches
// and entries are dropped at the front if it runs full.
type Queue struct {
    logger              log.Logger
    maxBatchSize        int
    capacity            int
    toAddLset           labels.Labels
    toExcludeLabels     labels.Labels
    alertRelabelConfigs []*relabel.Config

    mtx   sync.Mutex
    queue []*notifier.Alert
    morec chan struct{}

    pushed  prometheus.Counter
    popped  prometheus.Counter
    dropped prometheus.Counter
}

func relabelLabels(lset labels.Labels, excludeLset []string) (toAdd, toExclude labels.Labels) {
    for _, ln := range excludeLset {
        toExclude = append(toExclude, labels.Label{Name: ln})
    }

    for _, l := range lset {
        // Skip excluded labels so they are not added.
        if toExclude.Has(l.Name) {
            continue
        }
        toAdd = append(toAdd, labels.Label{
            Name:  l.Name,
            Value: l.Value,
        })
    }
    return toAdd, toExclude
}
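
// Illustrative sketch (not part of the upstream file): relabelLabels splits an external
// label set into the labels to attach to every alert (toAdd) and the label names to strip
// from incoming alerts (toExclude, values empty, used only for name lookups via Has).
// The function name and label values below are assumptions made up for the example.
func exampleRelabelLabels() {
    external := labels.FromStrings("cluster", "eu-1", "replica", "0")
    toAdd, toExclude := relabelLabels(external, []string{"replica"})

    // Prints roughly: {cluster="eu-1"} {replica=""}
    fmt.Println(toAdd, toExclude)
}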

// NewQueue returns a new queue. The given label set is attached to all alerts pushed to the queue.
// The given exclude label set tells which label names to drop, including external labels.
func NewQueue(logger log.Logger, reg prometheus.Registerer, capacity, maxBatchSize int, externalLset labels.Labels, excludeLabels []string, alertRelabelConfigs []*relabel.Config) *Queue {
    toAdd, toExclude := relabelLabels(externalLset, excludeLabels)

    if logger == nil {
        logger = log.NewNopLogger()
    }
    q := &Queue{
        logger:              logger,
        capacity:            capacity,
        morec:               make(chan struct{}, 1),
        maxBatchSize:        maxBatchSize,
        toAddLset:           toAdd,
        toExcludeLabels:     toExclude,
        alertRelabelConfigs: alertRelabelConfigs,

        dropped: promauto.With(reg).NewCounter(prometheus.CounterOpts{
            Name: "thanos_alert_queue_alerts_dropped_total",
            Help: "Total number of alerts that were dropped from the queue.",
        }),
        pushed: promauto.With(reg).NewCounter(prometheus.CounterOpts{
            Name: "thanos_alert_queue_alerts_pushed_total",
            Help: "Total number of alerts pushed to the queue.",
        }),
        popped: promauto.With(reg).NewCounter(prometheus.CounterOpts{
            Name: "thanos_alert_queue_alerts_popped_total",
            Help: "Total number of alerts popped from the queue.",
        }),
    }
    _ = promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
        Name: "thanos_alert_queue_capacity",
        Help: "Capacity of the alert queue.",
    }, func() float64 {
        return float64(q.Cap())
    })
    _ = promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
        Name: "thanos_alert_queue_length",
        Help: "Length of the alert queue.",
    }, func() float64 {
        return float64(q.Len())
    })
    return q
}

// Len returns the current length of the queue.
func (q *Queue) Len() int {
    q.mtx.Lock()
    defer q.mtx.Unlock()
    return len(q.queue)
}

// Cap returns the fixed capacity of the queue.
func (q *Queue) Cap() int {
    return q.capacity
}

// Pop takes a batch of alerts from the front of the queue. The batch size is limited
// by the queue's maxBatchSize.
// It blocks until elements are available or a termination signal is sent on termc.
func (q *Queue) Pop(termc <-chan struct{}) []*notifier.Alert {
    select {
    case <-termc:
        return nil
    case <-q.morec:
    }

    q.mtx.Lock()
    defer q.mtx.Unlock()

    as := make([]*notifier.Alert, q.maxBatchSize)
    n := copy(as, q.queue)
    q.queue = q.queue[n:]

    q.popped.Add(float64(n))

    if len(q.queue) > 0 {
        select {
        case q.morec <- struct{}{}:
        default:
        }
    }
    return as[:n]
}

// Push adds a list of alerts to the queue.
func (q *Queue) Push(alerts []*notifier.Alert) {
    if len(alerts) == 0 {
        return
    }

    q.mtx.Lock()
    defer q.mtx.Unlock()

    q.pushed.Add(float64(len(alerts)))

    // Attach external labels, drop excluded labels, and apply relabeling before sending.
    var relabeledAlerts []*notifier.Alert
    for _, a := range alerts {
        lb := labels.NewBuilder(labels.Labels{})
        for _, l := range a.Labels {
            if q.toExcludeLabels.Has(l.Name) {
                continue
            }
            lb.Set(l.Name, l.Value)
        }
        for _, l := range q.toAddLset {
            lb.Set(l.Name, l.Value)
        }

        if lset, keep := relabel.Process(lb.Labels(), q.alertRelabelConfigs...); keep {
            a.Labels = lset
            relabeledAlerts = append(relabeledAlerts, a)
        }
    }

    alerts = relabeledAlerts
    if len(alerts) == 0 {
        return
    }
    // Queue capacity should be significantly larger than a single alert
    // batch could be.
    if d := len(alerts) - q.capacity; d > 0 {
        alerts = alerts[d:]

        level.Warn(q.logger).Log(
            "msg", "Alert batch larger than queue capacity, dropping alerts",
            "numDropped", d)
        q.dropped.Add(float64(d))
    }

    // If the queue is full, remove the oldest alerts in favor
    // of newer ones.
    if d := (len(q.queue) + len(alerts)) - q.capacity; d > 0 {
        q.queue = q.queue[d:]

        level.Warn(q.logger).Log(
            "msg", "Alert notification queue full, dropping alerts",
            "numDropped", d)
        q.dropped.Add(float64(d))
    }

    q.queue = append(q.queue, alerts...)

    select {
    case q.morec <- struct{}{}:
    default:
    }
}
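
// Hedged usage sketch (not part of the upstream file): a producer pushes alerts while a
// consumer drains the queue in batches with Pop. Pop blocks until alerts are available or
// the termination channel is closed, so closing termc shuts the consumer down. The external
// label set and the dropped "replica" label mirror the common HA-ruler setup, but all names
// and values here are assumptions for the example only.
func exampleQueueUsage() {
    q := NewQueue(
        log.NewNopLogger(),
        prometheus.NewRegistry(),
        10000, // capacity
        100,   // maxBatchSize
        labels.FromStrings("cluster", "eu-1", "replica", "0"), // attached to every alert
        []string{"replica"}, // dropped so replicated rulers emit identical alerts
        nil,                 // no alert relabel configs
    )

    termc := make(chan struct{})

    // Consumer: drain the queue in batches until termc is closed.
    go func() {
        for {
            batch := q.Pop(termc)
            if batch == nil {
                return // termination requested
            }
            _ = batch // hand the batch to a Sender here
        }
    }()

    // Producer: push a single firing alert.
    q.Push([]*notifier.Alert{{
        Labels:   labels.FromStrings("alertname", "HighErrorRate"),
        StartsAt: time.Now(),
    }})

    close(termc)
}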

// Sender sends notifications to a dynamic set of alertmanagers.
type Sender struct {
    logger        log.Logger
    alertmanagers []*Alertmanager
    versions      []APIVersion

    sent    *prometheus.CounterVec
    errs    *prometheus.CounterVec
    dropped prometheus.Counter
    latency *prometheus.HistogramVec
}

// NewSender returns a new sender. On each call to Send the entire alert batch is sent
// to each of the given Alertmanagers.
func NewSender(
    logger log.Logger,
    reg prometheus.Registerer,
    alertmanagers []*Alertmanager,
) *Sender {
    if logger == nil {
        logger = log.NewNopLogger()
    }
    // Deduplicate the API versions in use so each payload is marshaled only once in Send.
    var (
        versions       []APIVersion
        versionPresent = map[APIVersion]struct{}{}
    )
    for _, am := range alertmanagers {
        if _, found := versionPresent[am.version]; found {
            continue
        }
        versionPresent[am.version] = struct{}{}
        versions = append(versions, am.version)
    }
    s := &Sender{
        logger:        logger,
        alertmanagers: alertmanagers,
        versions:      versions,

        sent: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
            Name: "thanos_alert_sender_alerts_sent_total",
            Help: "Total number of alerts sent by alertmanager.",
        }, []string{"alertmanager"}),

        errs: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
            Name: "thanos_alert_sender_errors_total",
            Help: "Total number of errors while sending alerts to alertmanager.",
        }, []string{"alertmanager"}),

        dropped: promauto.With(reg).NewCounter(prometheus.CounterOpts{
            Name: "thanos_alert_sender_alerts_dropped_total",
            Help: "Total number of alerts dropped when sending to all Alertmanagers failed.",
        }),

        latency: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
            Name: "thanos_alert_sender_latency_seconds",
            Help: "Latency for sending alert notifications (not including dropped notifications).",
        }, []string{"alertmanager"}),
    }
    return s
}

func toAPILabels(labels labels.Labels) models.LabelSet {
    apiLabels := make(models.LabelSet, len(labels))
    for _, label := range labels {
        apiLabels[label.Name] = label.Value
    }

    return apiLabels
}

// Send an alert batch to all given Alertmanager clients.
// TODO(bwplotka): https://github.com/thanos-io/thanos/issues/660.
func (s *Sender) Send(ctx context.Context, alerts []*notifier.Alert) {
    if len(alerts) == 0 {
        return
    }

    payload := make(map[APIVersion][]byte)
    for _, version := range s.versions {
        var (
            b   []byte
            err error
        )
        switch version {
        case APIv1:
            if b, err = json.Marshal(alerts); err != nil {
                level.Warn(s.logger).Log("msg", "encoding alerts for v1 API failed", "err", err)
                return
            }
        case APIv2:
            apiAlerts := make(models.PostableAlerts, 0, len(alerts))
            for _, a := range alerts {
                apiAlerts = append(apiAlerts, &models.PostableAlert{
                    Annotations: toAPILabels(a.Annotations),
                    EndsAt:      strfmt.DateTime(a.EndsAt),
                    StartsAt:    strfmt.DateTime(a.StartsAt),
                    Alert: models.Alert{
                        GeneratorURL: strfmt.URI(a.GeneratorURL),
                        Labels:       toAPILabels(a.Labels),
                    },
                })
            }
            if b, err = json.Marshal(apiAlerts); err != nil {
                level.Warn(s.logger).Log("msg", "encoding alerts for v2 API failed", "err", err)
                return
            }
        }
        payload[version] = b
    }

    var (
        wg         sync.WaitGroup
        numSuccess atomic.Uint64
    )
    for _, am := range s.alertmanagers {
        for _, u := range am.dispatcher.Endpoints() {
            wg.Add(1)
            go func(am *Alertmanager, u url.URL) {
                defer wg.Done()

                level.Debug(s.logger).Log("msg", "sending alerts", "alertmanager", u.Host, "numAlerts", len(alerts))
                start := time.Now()
                u.Path = path.Join(u.Path, fmt.Sprintf("/api/%s/alerts", string(am.version)))

                tracing.DoInSpan(ctx, "post_alerts HTTP[client]", func(ctx context.Context) {
                    if err := am.postAlerts(ctx, u, bytes.NewReader(payload[am.version])); err != nil {
                        level.Warn(s.logger).Log(
                            "msg", "sending alerts failed",
                            "alertmanager", u.Host,
                            "alerts", string(payload[am.version]),
                            "err", err,
                        )
                        s.errs.WithLabelValues(u.Host).Inc()
                        return
                    }
                    s.latency.WithLabelValues(u.Host).Observe(time.Since(start).Seconds())
                    s.sent.WithLabelValues(u.Host).Add(float64(len(alerts)))

                    numSuccess.Inc()
                })
            }(am, *u)
        }
    }
    wg.Wait()

    if numSuccess.Load() > 0 {
        return
    }

    s.dropped.Add(float64(len(alerts)))
    level.Warn(s.logger).Log("msg", "failed to send alerts to all alertmanagers", "numAlerts", len(alerts))
}
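
// Hedged sketch (not part of the upstream file): Queue and Sender are typically combined
// with a small forwarding loop that pops batches until termination and hands them to Send.
// Send does not return an error; failures surface through the sender's metrics and its
// dropped counter, which keeps the loop simple. Function and parameter names are assumptions.
func exampleForwardLoop(ctx context.Context, q *Queue, s *Sender) {
    for {
        batch := q.Pop(ctx.Done())
        if batch == nil {
            return // context cancelled; stop forwarding
        }
        s.Send(ctx, batch)
    }
}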

type Dispatcher interface {
    // Endpoints returns the list of endpoint URLs the dispatcher knows about.
    Endpoints() []*url.URL
    // Do sends an HTTP request and returns a response.
    Do(*http.Request) (*http.Response, error)
}

// Alertmanager is an HTTP client that can send alerts to a cluster of Alertmanager endpoints.
type Alertmanager struct {
    logger     log.Logger
    dispatcher Dispatcher
    timeout    time.Duration
    version    APIVersion
}
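
// Hedged sketch (not part of the upstream file): a minimal Dispatcher backed by a plain
// *http.Client and a static endpoint list. In Thanos the dispatcher is built from the
// Ruler's Alertmanager client configuration (service discovery, auth, TLS); this static
// variant only illustrates the interface contract. The type name is an assumption.
type staticDispatcher struct {
    client    *http.Client
    endpoints []*url.URL
}

func (d staticDispatcher) Endpoints() []*url.URL { return d.endpoints }

func (d staticDispatcher) Do(r *http.Request) (*http.Response, error) { return d.client.Do(r) }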

// NewAlertmanager returns a new Alertmanager client.
func NewAlertmanager(logger log.Logger, dispatcher Dispatcher, timeout time.Duration, version APIVersion) *Alertmanager {
    if logger == nil {
        logger = log.NewNopLogger()
    }

    return &Alertmanager{
        logger:     logger,
        dispatcher: dispatcher,
        timeout:    timeout,
        version:    version,
    }
}

func (a *Alertmanager) postAlerts(ctx context.Context, u url.URL, r io.Reader) error {
    req, err := http.NewRequest("POST", u.String(), r)
    if err != nil {
        return err
    }
    ctx, cancel := context.WithTimeout(ctx, a.timeout)
    defer cancel()
    req = req.WithContext(ctx)
    req.Header.Set("Content-Type", contentTypeJSON)

    resp, err := a.dispatcher.Do(req)
    if err != nil {
        return errors.Wrapf(err, "send request to %q", u.String())
    }
    defer runutil.ExhaustCloseWithLogOnErr(a.logger, resp.Body, "send one alert")

    if resp.StatusCode/100 != 2 {
        return errors.Errorf("bad response status %v from %q", resp.Status, u.String())
    }
    return nil
}
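
// Hedged end-to-end sketch (not part of the upstream file): build an Alertmanager client
// against a local Alertmanager on the default port using the illustrative staticDispatcher
// from the sketch above, wrap it in a Sender, and send one alert. APIv2 is the APIVersion
// constant defined elsewhere in this package; endpoint, timeout, and alert contents are
// assumptions for the example only.
func exampleSendToLocalAlertmanager(ctx context.Context) {
    u, err := url.Parse("http://localhost:9093")
    if err != nil {
        return
    }

    am := NewAlertmanager(
        log.NewNopLogger(),
        staticDispatcher{client: http.DefaultClient, endpoints: []*url.URL{u}},
        10*time.Second, // per-request timeout enforced in postAlerts
        APIv2,
    )
    s := NewSender(log.NewNopLogger(), prometheus.NewRegistry(), []*Alertmanager{am})

    s.Send(ctx, []*notifier.Alert{{
        Labels:   labels.FromStrings("alertname", "HighErrorRate", "severity", "warning"),
        StartsAt: time.Now(),
    }})
}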