# Origin: k8s.io/test-infra config/prow/cluster/monitoring/mixins/prometheus/plank_alerts.libsonnet
#
# Mixin that appends a 'Heartbeat ProwJobs' rule group to `prometheusAlerts`.
# One alert rule is generated per entry in `$._config.heartbeatJobs`; each entry
# is read for the fields `name`, `alertInterval` and `interval`.
{
  prometheusAlerts+:: {
    # Used as the value of the `slo` label on every alert in this file.
    local componentName = $._config.components.plank,
    groups+: [
      {
        name: 'Heartbeat ProwJobs',
        # To add more heartbeat PJs add entries to `heartbeatJobs` in config.libsonnet
        # NOTE: These alerts are associated with plank, but may be
        # triggered by problems with horologium or the pod utils.
        rules: [
          {
            alert: 'No recent successful runs: `%s`' % job.name,

            # This query counts the number of PJs with the specified name that
            # transitioned to the success state in the last job.alertInterval
            # amount of time. If that number is < 1 we return a result causing
            # the alert to fire. (We use 0.5 instead of 1 because query
            # results are not precise integers due to how prometheus interpolates.)
            #
            # `or vector(0)` covers the case where no matching series has samples
            # in the window: sum(increase(...)) is then an empty vector, and a
            # comparison on an empty vector returns nothing, which would leave the
            # alert silent even though there were no successful runs. Both sides
            # of the `or` carry an empty label set, so the alert labels are the
            # same whichever side produces the value.
            expr: |||
              (sum(increase(prowjob_state_transitions{job_name="%s", state="success"}[%s])) or vector(0)) < 0.5
            ||| % [job.name, job.alertInterval],
            labels: {
              severity: 'critical',
              slo: componentName,
            },
            annotations: {
              message: '@test-infra-oncall The heartbeat job `%s` has not had a successful run in the past %s (should run every %s).' % [job.name, job.alertInterval, job.interval],
            },
          }
          # One rule object per configured heartbeat job.
          for job in $._config.heartbeatJobs
        ],
      },
    ],
  },
}