k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/config/prow/cluster/monitoring/mixins/prometheus/tide_alerts.libsonnet (about)

     1  {
     2    prometheusAlerts+:: {
     3      local tideName = $._config.components.tide,
     4      local monitoringLink = $._config.instance.monitoringLink,
     5      local prowURL = $._config.instance.url,
     6      groups+: [
     7        {
     8          name: 'Tide progress',
     9          rules: [
    10            {
    11              alert: 'Sync controller heartbeat',
    12              expr: |||
    13                sum(increase(tidesyncheartbeat{controller="sync"}[15m])) < 1
    14              |||,
    15              'for': '5m',
    16              labels: {
    17                severity: 'critical',
    18                slo: tideName,
    19              },
    20              annotations: {
    21                message: 'The Tide "sync" controller has not synced in 15 minutes. See the %s.' % monitoringLink('/d/d69a91f76d8110d3e72885ee5ce8038e/tide-dashboard?orgId=1&from=now-24h&to=now&fullscreen&panelId=7', 'processing time graph'),
    22              },
    23            },
    24            {
    25              alert: 'Status-update controller heartbeat',
    26              expr: |||
    27                sum(increase(tidesyncheartbeat{controller="status-update"}[30m])) < 1
    28              |||,
    29              'for': '5m',
    30              labels: {
    31                severity: 'critical',
    32                slo: tideName,
    33              },
    34              annotations: {
    35                message: 'The Tide "status-update" controller has not synced in 30 minutes. See the %s.' % monitoringLink('/d/d69a91f76d8110d3e72885ee5ce8038e/tide-dashboard?orgId=1&from=now-24h&to=now&fullscreen&panelId=7', 'processing time graph'),
    36              },
    37            },
    38            {
    39              alert: 'TidePool error rate: individual',
    40              expr: |||
    41                ((sum(increase(tidepoolerrors{org!="kubeflow"}[10m])) by (org, repo, branch)) or vector(0)) >= 3
    42              |||,
    43              'for': '5m',
    44              labels: {
    45                severity: 'warning',
    46              },
    47              annotations: {
    48                message: 'At least one Tide pool encountered 3+ sync errors in a 10 minute window. If the TidePoolErrorRateMultiple alert has not fired this is likely an isolated configuration issue. See the <%s/tide-history|/tide-history> page and the %s.' % [prowURL, monitoringLink('/d/d69a91f76d8110d3e72885ee5ce8038e/tide-dashboard?orgId=1&fullscreen&panelId=6&from=now-24h&to=now', 'sync error graph')],
    49              },
    50            },
    51            {
    52              alert: 'TidePool error rate: multiple',
    53              expr: |||
    54                (count(sum(increase(tidepoolerrors[10m])) by (org, repo) >= 3) or vector(0)) >= 3
    55              |||,
    56              'for': '5m',
    57              labels: {
    58                severity: 'critical',
    59                slo: tideName,
    60              },
    61              annotations: {
    62                message: 'Tide encountered 3+ sync errors in a 10 minute window in at least 3 different repos that it handles. See the <%s/tide-history|/tide-history> page and the %s.' % [prowURL, monitoringLink('/d/d69a91f76d8110d3e72885ee5ce8038e/tide-dashboard?orgId=1&fullscreen&panelId=6&from=now-24h&to=now', 'sync error graph')],
    63              },
    64            },
    65          ],
    66        },
    67      ],
    68    },
    69  }