// Source: k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/config/prow/cluster/monitoring/mixins/prometheus/tide_alerts.libsonnet
//
// Prometheus alerting rules for Tide. The rules are appended (`+:`) to the
// `groups` list of the hidden `prometheusAlerts` field, so this object is
// meant to be merged with the rest of the mixin, which must provide `$._config`.
{
  prometheusAlerts+:: {
    // Value used for the `slo` label on the critical alerts below.
    local tideName = $._config.components.tide,
    // Helper defined elsewhere in the mixin config; called here with a
    // dashboard path and a link text, and its result is spliced into the
    // alert message via `%s`.
    local monitoringLink = $._config.instance.monitoringLink,
    // Base URL of the Prow instance; used to build the /tide-history link.
    local prowURL = $._config.instance.url,

    // Dashboard paths shared by several alerts (kept in one place so the
    // alerts cannot drift apart).
    local heartbeatGraphPath = '/d/d69a91f76d8110d3e72885ee5ce8038e/tide-dashboard?orgId=1&from=now-24h&to=now&fullscreen&panelId=7',
    local syncErrorGraphPath = '/d/d69a91f76d8110d3e72885ee5ce8038e/tide-dashboard?orgId=1&fullscreen&panelId=6&from=now-24h&to=now',

    groups+: [
      {
        name: 'Tide progress',
        rules: [
          {
            alert: 'Sync controller heartbeat',
            // `or vector(0)` makes a missing heartbeat series count as zero.
            // Without it, `sum(...)` over no samples is an empty vector, the
            // `< 1` comparison yields nothing, and the alert can never fire
            // when Tide is down or not scraped -- exactly the case a
            // heartbeat alert must catch. The un-grouped `sum` has no labels,
            // so `vector(0)` only contributes when the left side is empty.
            expr: |||
              (sum(increase(tidesyncheartbeat{controller="sync"}[15m])) or vector(0)) < 1
            |||,
            'for': '5m',
            labels: {
              severity: 'critical',
              slo: tideName,
            },
            annotations: {
              message: 'The Tide "sync" controller has not synced in 15 minutes. See the %s.' % monitoringLink(heartbeatGraphPath, 'processing time graph'),
            },
          },
          {
            alert: 'Status-update controller heartbeat',
            // Same absent-series handling as the sync heartbeat above, with a
            // 30 minute window.
            expr: |||
              (sum(increase(tidesyncheartbeat{controller="status-update"}[30m])) or vector(0)) < 1
            |||,
            'for': '5m',
            labels: {
              severity: 'critical',
              slo: tideName,
            },
            annotations: {
              message: 'The Tide "status-update" controller has not synced in 30 minutes. See the %s.' % monitoringLink(heartbeatGraphPath, 'processing time graph'),
            },
          },
          {
            alert: 'TidePool error rate: individual',
            // Fires per (org, repo, branch) pool with 3+ errors in 10 minutes;
            // the kubeflow org is excluded by the selector. The `vector(0)`
            // element can never satisfy `>= 3`, so it does not affect firing.
            expr: |||
              ((sum(increase(tidepoolerrors{org!="kubeflow"}[10m])) by (org, repo, branch)) or vector(0)) >= 3
            |||,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              // The message refers to the sibling alert by the name it
              // actually has in this file ('TidePool error rate: multiple').
              message: 'At least one Tide pool encountered 3+ sync errors in a 10 minute window. If the "TidePool error rate: multiple" alert has not fired this is likely an isolated configuration issue. See the <%s/tide-history|/tide-history> page and the %s.' % [prowURL, monitoringLink(syncErrorGraphPath, 'sync error graph')],
            },
          },
          {
            alert: 'TidePool error rate: multiple',
            // Counts the (org, repo) pairs that each saw 3+ errors in 10
            // minutes and fires when at least 3 such pairs exist.
            expr: |||
              (count(sum(increase(tidepoolerrors[10m])) by (org, repo) >= 3) or vector(0)) >= 3
            |||,
            'for': '5m',
            labels: {
              severity: 'critical',
              slo: tideName,
            },
            annotations: {
              message: 'Tide encountered 3+ sync errors in a 10 minute window in at least 3 different repos that it handles. See the <%s/tide-history|/tide-history> page and the %s.' % [prowURL, monitoringLink(syncErrorGraphPath, 'sync error graph')],
            },
          },
        ],
      },
    ],
  },
}