// Alert rules for the 'prow-monitoring-absent' group: one rule for reduced
// redundancy of prometheus/alertmanager, plus one "<job>Down" rule per job.
{
  prometheusAlerts+:: {
    // Value used for the `slo` label on every rule in this group; read from
    // `_config.components.monitoring` on the root object.
    local sloComponent = $._config.components.monitoring,

    // Labels shared by all rules defined below.
    local criticalLabels = {
      severity: 'critical',
      slo: sloComponent,
    },

    // Fires when the per-job sum of `up` for the prometheus or alertmanager
    // jobs has been <= 1 for 5 minutes.
    local lostHARule = {
      alert: 'ServiceLostHA',
      expr: |||
        sum(up{job=~"prometheus|alertmanager"}) by (job) <= 1
      |||,
      'for': '5m',
      labels: criticalLabels,
      annotations: {
        message: 'The service {{ $labels.job }} has at most 1 instance for 5 minutes.',
      },
    },

    // Builds the "<job>Down" rule: fires when, for 5 minutes, no `up` series
    // for the given job has the value 1.
    local serviceDownRule(job) = {
      alert: '%sDown' % job,
      expr: |||
        absent(up{job="%s"} == 1)
      ||| % job,
      'for': '5m',
      labels: criticalLabels,
      annotations: {
        message: 'The service %s has been down for 5 minutes.' % job,
      },
    },

    groups+: [
      {
        name: 'prow-monitoring-absent',
        // HA rule first, then one "Down" rule per job, in the listed order.
        rules: [lostHARule] + std.map(
          serviceDownRule,
          ['alertmanager', 'prometheus', 'grafana'],
        ),
      },
    ],
  },
}