// Alerting rules for sites watched through blackbox_exporter probes.
//
// One rule is generated for every entry of `$._config.probeTargets`; each
// entry is read for its `url` and `labels` fields.

// Builds the alerting rule for a single probe target.
local siteUnavailableRule(target) = {
  // The URL is used as the alert name suffix, the `instance` matcher and the
  // link in the notification message.
  local url = target.url,

  alert: 'Site unavailable: %s' % url,

  // True when the minimum probe_success value for this instance is 0.
  expr: |||
    min(probe_success{instance="%s"}) == 0
  ||| % url,

  // Note carried over from the original author (unverified): this is believed
  // to need to be at least the scrape_interval and 2*evaluation_interval
  // (both default to 1m) so that individual probe failures are ignored.
  'for': '2m',

  // Fields from the target's own labels are merged on top of the default
  // severity.
  labels: { severity: 'critical' } + target.labels,

  annotations: {
    message: 'The blackbox_exporter HTTP probe has detected that the following site has been unhealthy (not 2xx HTTP response) for at least 2 minutes: <%s|%s>.' % [url, url],
  },
};

{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'Blackbox Prober',
        rules: [
          siteUnavailableRule(probeTarget)
          for probeTarget in $._config.probeTargets
        ],
      },
    ],
  },
}