k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/config/prow/cluster/monitoring/mixins/prometheus/slo_recordrules.libsonnet (about)

     1  {
     2    prometheusAlerts+:: {
     3      # Names of the SLO-tracked components, read from the shared mixin config.
     4      local components = $._config.slo.components,
     5  
     6      groups+: [
     7        {
     8          name: 'SLO Compliance',
     9          interval: '1m',
    10          rules: [
    11            {
    12              # Per-component compliance: 1 while no alert labelled with the component's "slo" is firing, 0 otherwise.
    13              record: 'slo_component_ok',
    14              # We can't check for the absence of alerts without explicitly listing the components we are checking for. These are defined in config.
    15              # We want the SLO metrics to include alert-specific labels when the SLO is violated. This means there may be multiple time series per component when out of SLO since multiple alerts may be firing, but all should have value 0.
    16              # If a component is SLO compliant there will be a single time series for that "slo" label: vector(1){slo="component-name"}
    17  
    18              # An empty component list would render as "min(() or (...))", which is not valid PromQL,
    19              # so stop evaluation here rather than emit a rule Prometheus cannot parse.
    20              assert std.length(components) > 0 : 'slo_component_ok: $._config.slo.components must list at least one component',
    21  
    22              # One absent() selector per component, joined with "or".
    23              local absents = std.join(
    24                ' or ',
    25                ['absent(ALERTS{alertstate="firing", slo="%s"})' % comp for comp in components],
    26              ),
    27              # Regex alternation matching any listed component.
    28              # NOTE(review): names are inserted unescaped; presumably they contain no regex metacharacters - confirm against config.
    29              local allCompsRE = std.join('|', components),
    30  
    31              # "ALERTS - 1" turns each firing alert (value 1) into a 0-valued series that keeps the alert's labels.
    32              expr: |||
    33                min((%s) or (ALERTS{alertstate="firing", slo=~"%s"} - 1)) without (alertstate)
    34              ||| % [absents, allCompsRE],
    35  
    36              # Example compiled query for components=['tide', 'hook']
    37              # min((absent(ALERTS{alertstate="firing", slo="tide"}) or absent(ALERTS{alertstate="firing", slo="hook"})) or (ALERTS{alertstate="firing", slo=~"tide|hook"} - 1)) without (alertstate)
    38            },
    39            {
    40              # Overall compliance: a single label-less series with value 1 while no slo_component_ok series equals 0;
    41              # otherwise the violating slo_component_ok series themselves (value 0, labels preserved).
    42              record: 'slo_prow_ok',
    43              expr: '(vector(1) unless min(slo_component_ok == 0)) or (slo_component_ok == 0)',
    44            },
    45          ],
    46        },
    47      ],
    48    },
    49  }