github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/production/loki-mixin/alerts.libsonnet (about)

     1  {  // Loki mixin: Prometheus alerting rules for the 'loki_alerts' group.
     2    prometheusAlerts+:: {  // hidden field (::) that extends (+) any inherited prometheusAlerts
     3      groups+: [  // appended to any rule groups already defined by the parent object
     4        {
     5          name: 'loki_alerts',
     6          rules: [
     7            {  // Fires when more than 10% of requests for a (namespace, job, route) return a 5xx status.
     8              alert: 'LokiRequestErrors',
     9              expr: |||
    10                100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
    11                  /
    12                sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
    13                  > 10
    14              |||,  // NOTE(review): the 1m rate window assumes a scrape interval well under 1m - confirm
    15              'for': '15m',  // the error ratio must stay above the threshold for 15 minutes
    16              labels: {
    17                severity: 'critical',
    18              },
    19              annotations: {
    20                message: |||
    21                  {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
    22                |||,  // $value is the percentage computed by the expression above
    23              },
    24            },
    25            {  // Fires as soon as any panic is counted within the last 10 minutes (no 'for' delay is set).
    26              alert: 'LokiRequestPanics',
    27              expr: |||
    28                sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
    29              |||,  // the result is an absolute number of panics per (namespace, job), not a ratio
    30              labels: {
    31                severity: 'critical',
    32              },
    33              annotations: {
    34                message: |||
    35                  {{ $labels.job }} has seen an increase of {{ printf "%.2f" $value }} panics over the last 10m.
    36                |||,  // reports a count; two decimals are kept because increase() can return fractional values
    37              },
    38            },
    39            {  // Fires when the recorded 99th percentile latency stays above 1 for 15 minutes.
    40              alert: 'LokiRequestLatency',
    41              expr: |||
    42                namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
    43              |||,  // routes matching "tail" (case-insensitive) are excluded; the recording rule is defined outside this file - verify it exists
    44              'for': '15m',
    45              labels: {
    46                severity: 'critical',
    47              },
    48              annotations: {
    49                message: |||
    50                  {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
    51                |||,  // the message presents $value in seconds
    52              },
    53            },
    54            {  // Warns when the per-namespace sum of the compactor-running metric stays above 1 for 5 minutes.
    55              alert: 'LokiTooManyCompactorsRunning',
    56              expr: |||
    57                sum(loki_boltdb_shipper_compactor_running) by (namespace) > 1
    58              |||,
    59              'for': '5m',  // matches the "more than 5m" wording in the message below
    60              labels: {
    61                severity: 'warning',
    62              },
    63              annotations: {
    64                message: |||
    65                  {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
    66                |||,
    67              },
    68            },
    69          ],
    70        },
    71      ],
    72    },
    73  }