k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/config/prow/cluster/monitoring/mixins/prometheus/plank_alerts.libsonnet (about)

     1  {
     2    prometheusAlerts+:: {
     3      local componentName = $._config.components.plank,
     4      groups+: [
     5        {
     6          name: 'Heartbeat ProwJobs',
     7          # To add more heartbeat PJs add entries to `heartbeatJobs` in config.libsonnet
     8          # NOTE: These alerts are associated with plank, but may be
     9          #       triggered by problems with horologium or the pod utils.
    10          rules: [
    11            {
    12              alert: 'No recent successful runs: `%s`' % job.name,
    13  
    14              # This query counts the number of PJs with the specified name that
    15              # transitioned to the success state in the last job.alertInterval
    16              # amount of time. If that number is < 1 we return a result causing
    17              # the alert to fire. (We use 0.5 instead of 1 because query
    18              # results are not precise integers due to how prometheus interpolates.)
    19              expr: |||
    20                sum(increase(prowjob_state_transitions{job_name="%s", state="success"}[%s])) < 0.5
    21              ||| % [job.name, job.alertInterval],
    22              labels: {
    23                severity: 'critical',
    24                slo: componentName,
    25              },
    26              annotations: {
    27                message: '@test-infra-oncall The heartbeat job `%s` has not had a successful run in the past %s (should run every %s).' % [job.name, job.alertInterval, job.interval],
    28              },
    29            }
    30            for job in $._config.heartbeatJobs
    31          ],
    32        },
    33      ],
    34    },
    35  }