github.com/verrazzano/verrazzano@v1.7.0/tools/psr/backend/workers/prometheus/alerts/alerts.go (about)

     1  // Copyright (c) 2022, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  package alerts
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"io"
    10  	"net/http"
    11  	"sync/atomic"
    12  	"time"
    13  
    14  	"github.com/verrazzano/verrazzano/pkg/httputil"
    15  	"github.com/verrazzano/verrazzano/pkg/log/vzlog"
    16  	"github.com/verrazzano/verrazzano/tools/psr/backend/config"
    17  	"github.com/verrazzano/verrazzano/tools/psr/backend/metrics"
    18  	"github.com/verrazzano/verrazzano/tools/psr/backend/osenv"
    19  	"github.com/verrazzano/verrazzano/tools/psr/backend/pkg/k8sclient"
    20  	psrprom "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/prometheus"
    21  	psrvz "github.com/verrazzano/verrazzano/tools/psr/backend/pkg/verrazzano"
    22  	"github.com/verrazzano/verrazzano/tools/psr/backend/spi"
    23  
    24  	"github.com/prometheus/client_golang/prometheus"
    25  	corev1 "k8s.io/api/core/v1"
    26  	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    27  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    28  )
    29  
// metricsPrefix is the prefix that is automatically pre-pended to all metrics exported by this worker.
// It is reported through GetWorkerDesc().MetricsPrefix and handed to metrics.BuildMetricDescList
// when the worker is constructed.
const metricsPrefix = "prom_alert"
    32  
// funcNewPsrClient is the factory used by this package to obtain a PSR Kubernetes client.
// It is a package-level variable rather than a direct call to k8sclient.NewPsrClient,
// presumably so tests can substitute a fake client (verify against the test files).
var funcNewPsrClient = k8sclient.NewPsrClient
    34  
// worker is the alerts receiver worker. It implements spi.Worker with value receivers;
// because workerMetrics is embedded by pointer, every copy of a worker shares the same
// underlying metric counters.
type worker struct {
	// metricDescList holds the metric descriptors built once in NewAlertsWorker.
	metricDescList []prometheus.Desc
	*workerMetrics
}
    39  
// Compile-time check that worker satisfies the spi.Worker interface.
var _ spi.Worker = worker{}
    41  
// workerMetrics holds the metrics produced by the worker. Metrics must be thread safe.
// Both counters are incremented with atomic.AddInt64 from the /alerts HTTP handler.
type workerMetrics struct {
	// alertsFiringCount counts webhook requests whose status is "firing".
	alertsFiringCount metrics.MetricItem
	// alertsResolvedCount counts webhook requests whose status is "resolved".
	alertsResolvedCount metrics.MetricItem
}
    47  
    48  func NewAlertsWorker() (spi.Worker, error) {
    49  	w := worker{workerMetrics: &workerMetrics{
    50  		alertsFiringCount: metrics.MetricItem{
    51  			Name: "alerts_firing_received_count",
    52  			Help: "The total number of alerts received from alertmanager",
    53  			Type: prometheus.CounterValue,
    54  		},
    55  		alertsResolvedCount: metrics.MetricItem{
    56  			Name: "alerts_resolved_received_count",
    57  			Help: "The total number of alerts received from alertmanager",
    58  			Type: prometheus.CounterValue,
    59  		},
    60  	}}
    61  
    62  	if err := config.PsrEnv.LoadFromEnv(w.GetEnvDescList()); err != nil {
    63  		return w, err
    64  	}
    65  
    66  	metricsLabels := map[string]string{
    67  		config.PsrWorkerTypeMetricsName: config.PsrEnv.GetEnv(config.PsrWorkerType),
    68  	}
    69  
    70  	w.metricDescList = metrics.BuildMetricDescList([]*metrics.MetricItem{
    71  		&w.alertsFiringCount,
    72  		&w.alertsResolvedCount,
    73  	}, metricsLabels, w.GetWorkerDesc().MetricsPrefix)
    74  	return w, nil
    75  }
    76  
    77  // GetWorkerDesc returns the WorkerDesc for the worker
    78  func (w worker) GetWorkerDesc() spi.WorkerDesc {
    79  	return spi.WorkerDesc{
    80  		WorkerType:    config.WorkerTypeHTTPGet,
    81  		Description:   "The alerts receiver worker configures alertmanger and receives alerts and writes them to events",
    82  		MetricsPrefix: metricsPrefix,
    83  	}
    84  }
    85  
    86  func (w worker) GetEnvDescList() []osenv.EnvVarDesc {
    87  	return []osenv.EnvVarDesc{}
    88  }
    89  
    90  func (w worker) GetMetricDescList() []prometheus.Desc {
    91  	return w.metricDescList
    92  }
    93  
    94  func (w worker) GetMetricList() []prometheus.Metric {
    95  	return []prometheus.Metric{
    96  		w.alertsFiringCount.BuildMetric(),
    97  		w.alertsResolvedCount.BuildMetric(),
    98  	}
    99  }
   100  
   101  func (w worker) WantLoopInfoLogged() bool {
   102  	return false
   103  }
   104  
   105  func (w worker) PreconditionsMet() (bool, error) {
   106  	return true, nil
   107  }
   108  
   109  func (w worker) DoWork(conf config.CommonConfig, log vzlog.VerrazzanoLogger) error {
   110  	if err := updateVZForAlertmanager(log, conf); err != nil {
   111  		return err
   112  	}
   113  
   114  	http.HandleFunc("/alerts", func(rw http.ResponseWriter, r *http.Request) {
   115  		c, err := funcNewPsrClient()
   116  		if err != nil {
   117  			log.Errorf("error creating client: %v", err)
   118  		}
   119  		if r.Body == nil {
   120  			log.Errorf("Alert webhook POST request contained a nil body")
   121  			return
   122  		}
   123  		bodyRaw, err := io.ReadAll(r.Body)
   124  		if err != nil {
   125  			log.Errorf("Unexpected error while reading request body: %v", err)
   126  			return
   127  		}
   128  		r.Body.Close()
   129  		alertName, err := httputil.ExtractFieldFromResponseBodyOrReturnError(string(bodyRaw), "alerts.0.labels.alertname", "unable to extract alertname from body json")
   130  		if err != nil {
   131  			log.Error(err)
   132  		}
   133  		alertStatus, err := httputil.ExtractFieldFromResponseBodyOrReturnError(string(bodyRaw), "status", "unable to extract alert status from body json")
   134  		if err != nil {
   135  			log.Error(err)
   136  		}
   137  		if alertStatus == "firing" {
   138  			atomic.AddInt64(&w.workerMetrics.alertsFiringCount.Val, 1)
   139  		} else if alertStatus == "resolved" {
   140  			atomic.AddInt64(&w.workerMetrics.alertsResolvedCount.Val, 1)
   141  		} else {
   142  			log.Errorf("alert received with unknown status: %s", alertStatus)
   143  		}
   144  
   145  		event := corev1.Event{
   146  			ObjectMeta: v1.ObjectMeta{
   147  				Name:      "psr-alert-" + alertName,
   148  				Namespace: config.PsrEnv.GetEnv(config.PsrWorkerNamespace),
   149  			},
   150  			InvolvedObject: corev1.ObjectReference{
   151  				Namespace: config.PsrEnv.GetEnv(config.PsrWorkerNamespace),
   152  			},
   153  			Type: "Warning",
   154  		}
   155  		if _, err = controllerutil.CreateOrUpdate(context.TODO(), c.CrtlRuntime, &event, func() error {
   156  			event.LastTimestamp = v1.Time{Time: time.Now()}
   157  			event.Message = string(bodyRaw)
   158  			event.Reason = "Alert " + alertStatus
   159  			return nil
   160  		}); err != nil {
   161  			log.Errorf("error generating alert event: %v", err)
   162  		}
   163  	})
   164  	select {}
   165  }
   166  
   167  func updateVZForAlertmanager(log vzlog.VerrazzanoLogger, conf config.CommonConfig) error {
   168  	if err := createAlertmanagerOverridesCM(log, conf); err != nil {
   169  		return err
   170  	}
   171  
   172  	c, err := funcNewPsrClient()
   173  	if err != nil {
   174  		log.Errorf("error creating client: %v", err)
   175  	}
   176  	cr, err := psrvz.GetVerrazzano(c.VzInstall)
   177  	if err != nil {
   178  		return err
   179  	}
   180  	var m psrprom.AlertmanagerConfigModifier
   181  	m.ModifyCR(cr)
   182  
   183  	return psrvz.UpdateVerrazzano(c.VzInstall, cr)
   184  }
   185  
   186  func createAlertmanagerOverridesCM(log vzlog.VerrazzanoLogger, conf config.CommonConfig) error {
   187  	c, err := funcNewPsrClient()
   188  	if err != nil {
   189  		log.Errorf("error creating client: %v", err)
   190  	}
   191  	cr, err := psrvz.GetVerrazzano(c.VzInstall)
   192  	if err != nil {
   193  		return err
   194  	}
   195  	cm := corev1.ConfigMap{
   196  		ObjectMeta: v1.ObjectMeta{
   197  			Name:      psrprom.AlertmanagerCMName,
   198  			Namespace: cr.Namespace,
   199  		},
   200  		Data: map[string]string{},
   201  	}
   202  	_, err = controllerutil.CreateOrUpdate(context.TODO(), c.CrtlRuntime, &cm, func() error {
   203  		cm.Data = map[string]string{
   204  			psrprom.AlertmanagerCMKey: fmt.Sprintf(`alertmanager:
   205    alertmanagerSpec:
   206      podMetadata:
   207        annotations:
   208          sidecar.istio.io/inject: "false"
   209    config:
   210      receivers:
   211      - webhook_configs:
   212        - url: http://%s-%s.%s:9090/alerts
   213        name: webhook
   214      route:
   215        group_by:
   216        - alertname
   217        receiver: webhook
   218        routes:
   219        - match:
   220            alertname: Watchdog
   221          receiver: webhook
   222    enabled: true
   223  `, conf.ReleaseName, conf.WorkerType, conf.Namespace,
   224  			)}
   225  		return nil
   226  	})
   227  	return err
   228  }