k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/config/prow/cluster/monitoring/mixins/prometheus/prober_alerts.libsonnet

k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/config/prow/cluster/monitoring/mixins/prometheus/prober_alerts.libsonnet (about)

     1  {
     2    prometheusAlerts+:: {
     3      groups+: [
     4        {
     5          name: 'Blackbox Prober',
     6          rules: [
     7            {
     8              alert: 'Site unavailable: %s' % target.url,
     9              expr: |||
    10                min(probe_success{instance="%s"}) == 0
    11              ||| % target.url,
    12              'for': '2m', # I think this needs to be at least the scrape_interval and 2*evaluation_interval (which both default to 1m) in order to ignore individual probe failures.
    13              labels: {
    14                severity: 'critical',
    15              } + target.labels,
    16              annotations: {
    17                message: 'The blackbox_exporter HTTP probe has detected that the following site has been unhealthy (not 2xx HTTP response) for at least 2 minutes: <%s|%s>.' % [target.url, target.url],
    18              },
    19            }
    20            for target in $._config.probeTargets
    21          ],
    22        },
    23      ],
    24    },
    25  }