github.com/thanos-io/thanos@v0.32.5/mixin/alerts/bucket-replicate.libsonnet (about)

     1  // Prometheus alerting rules for the Thanos Bucket Replicate component.
     2  // Reads `targetGroups` from the object this fragment is merged into and appends
     3  // one rule group to `prometheusAlerts.groups`.
     4  {
     5    local thanos = self,
     6    // Hidden, overridable settings substituted into the alert expressions below.
     7    bucketReplicate+:: {
     8      // Label matchers placed inside every metric selector. There is no default:
     9      // evaluating the rules without overriding this field raises the error.
    10      selector: error 'must provide selector for Thanos Bucket Replicate alerts',
    11      // Failed-run percentage at or above which ThanosBucketReplicateErrorRate fires.
    12      errorThreshold: 10,
    13      // p99 run duration, in seconds, above which ThanosBucketReplicateRunLatency fires.
    14      p99LatencyThreshold: 20,
    15      // Aggregation labels: every key of targetGroups followed by 'job', comma-separated.
    16      dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']),
    17    },
    18    prometheusAlerts+:: {
    19      // A null bucketReplicate contributes no rule group.
    20      groups+: if thanos.bucketReplicate == null then [] else [
    21        // Suffix for annotations: ' in {{$labels.<k1>}}/{{$labels.<k2>}}' over the
    22        // targetGroups keys, or '' when targetGroups has no keys.
    23        local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in %s' % std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else '';
    24        {
    25          name: 'thanos-bucket-replicate',
    26          rules: [
    27            {
    28              alert: 'ThanosBucketReplicateErrorRate',
    29              annotations: {
    30                description: 'Thanos Replicate is failing to run%s, {{$value | humanize}}%% of attempts failed.' % location,
    31                summary: 'Thanos Replicate is failing to run%s.' % location,
    32              },
    33              // Share of runs with result="error" over 5m, as a percentage per dimension set.
    34              expr: |||
    35                (
    36                  sum by (%(dimensions)s) (rate(thanos_replicate_replication_runs_total{result="error", %(selector)s}[5m]))
    37                / on (%(dimensions)s) group_left
    38                  sum by (%(dimensions)s) (rate(thanos_replicate_replication_runs_total{%(selector)s}[5m]))
    39                ) * 100 >= %(errorThreshold)s
    40              ||| % thanos.bucketReplicate,
    41              'for': '5m',
    42              labels: {
    43                severity: 'critical',
    44              },
    45            },
    46            {
    47              alert: 'ThanosBucketReplicateRunLatency',
    48              annotations: {
    49                description: 'Thanos Replicate {{$labels.job}}%s has a 99th percentile latency of {{$value}} seconds for the replicate operations.' % location,
    50                summary: 'Thanos Replicate has a high latency for replicate operations.',
    51              },
    52              // `le` has to survive the aggregation: histogram_quantile computes the
    53              // quantile from the per-bucket `le` label and returns no result without it.
    54              // The `and` clause limits the alert to series that recorded runs in the window.
    55              expr: |||
    56                (
    57                  histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > %(p99LatencyThreshold)s
    58                and
    59                  sum by (%(dimensions)s) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0
    60                )
    61              ||| % thanos.bucketReplicate,
    62              'for': '5m',
    63              labels: {
    64                severity: 'critical',
    65              },
    66            },
    67          ],
    68        },
    69      ],
    70    },
    71  }