github.com/thanos-io/thanos@v0.32.5/mixin/alerts/store.libsonnet (about)

     1  {  // Thanos Store alerts: hidden `store` settings plus the 'thanos-store' Prometheus rule group.
     2    local thanos = self,  // Alias for this outermost object so nested fields can read thanos.store / thanos.targetGroups.
     3    store+:: {  // Hidden (`::`) settings; `+` merges them into any inherited `store` object.
     4      selector: error 'must provide selector for Thanos Store alerts',  // Required override: evaluating the default raises this error. Substituted inside `{...}` label matchers of every expr.
     5      grpcErrorThreshold: 5,  // Percent; limit for ThanosStoreGrpcErrorRate (ratio is multiplied by 100 before comparing).
     6      compactionErrorThreshold: 5,  // Not referenced by any rule visible in this file.
     7      seriesGateErrorThreshold: 2,  // Limit for the p99 of thanos_bucket_store_series_gate_duration_seconds (annotation reports seconds).
     8      bucketOpsErrorThreshold: 5,  // Percent; limit for ThanosStoreBucketHighOperationFailures.
     9      bucketOpsP99LatencyThreshold: 2,  // Limit for the p99 of thanos_objstore_bucket_operation_duration_seconds (annotation reports seconds).
    10      dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']),  // Aggregation labels for `sum by (...)`: field names of targetGroups (not defined in this file) followed by 'job'.
    11    },
    12    prometheusAlerts+:: {  // Hidden field; `groups+` appends to any groups already present.
    13      groups+: if thanos.store == null then [] else [  // Setting `store` to null contributes no groups.
    14        local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in %s' % std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else '';  // Annotation suffix ' in {{$labels.a}}/{{$labels.b}}...' built from targetGroups field names; empty string when there are none.
    15        {
    16          name: 'thanos-store',
    17          rules: [
    18            {
    19              alert: 'ThanosStoreGrpcErrorRate',  // Rate of gRPC requests handled with one of the listed error codes, as a percentage of the started-request rate, exceeds grpcErrorThreshold.
    20              annotations: {
    21                description: 'Thanos Store {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.' % location,  // `%s` receives `location`; `%%` is the format escape for a literal '%'.
    22                summary: 'Thanos Store is failing to handle gRPC requests.',
    23              },
    24              expr: |||
    25                (
    26                  sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
    27                /
    28                  sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m]))
    29                * 100 > %(grpcErrorThreshold)s
    30                )
    31              ||| % thanos.store,  // %(name)s placeholders are filled from the fields of thanos.store.
    32              'for': '5m',  // Key is quoted because `for` is a Jsonnet keyword (see the comprehension in `location`).
    33              labels: {
    34                severity: 'warning',
    35              },
    36            },
    37            {
    38              alert: 'ThanosStoreSeriesGateLatencyHigh',  // p99 series-gate duration exceeds seriesGateErrorThreshold, and only while the matching _count rate is above zero.
    39              annotations: {
    40                description: 'Thanos Store {{$labels.job}}%s has a 99th percentile latency of {{$value}} seconds for store series gate requests.' % location,
    41                summary: 'Thanos Store has high latency for store series gate requests.',
    42              },
    43              expr: |||
    44                (
    45                  histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > %(seriesGateErrorThreshold)s
    46                and
    47                  sum by (%(dimensions)s) (rate(thanos_bucket_store_series_gate_duration_seconds_count{%(selector)s}[5m])) > 0
    48                )
    49              ||| % thanos.store,
    50              'for': '10m',
    51              labels: {
    52                severity: 'warning',
    53              },
    54            },
    55            {
    56              alert: 'ThanosStoreBucketHighOperationFailures',  // Bucket operation failure rate, as a percentage of the bucket operation rate, exceeds bucketOpsErrorThreshold.
    57              annotations: {
    58                description: 'Thanos Store {{$labels.job}}%s Bucket is failing to execute {{$value | humanize}}%% of operations.' % location,
    59                summary: 'Thanos Store Bucket is failing to execute operations.',
    60              },
    61              expr: |||
    62                (
    63                  sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m]))
    64                /
    65                  sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m]))
    66                * 100 > %(bucketOpsErrorThreshold)s
    67                )
    68              ||| % thanos.store,
    69              'for': '15m',
    70              labels: {
    71                severity: 'warning',
    72              },
    73            },
    74            {
    75              alert: 'ThanosStoreObjstoreOperationLatencyHigh',  // p99 bucket operation duration exceeds bucketOpsP99LatencyThreshold, and only while the matching _count rate is above zero.
    76              annotations: {
    77                description: 'Thanos Store {{$labels.job}}%s Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.' % location,
    78                summary: 'Thanos Store is having high latency for bucket operations.',
    79              },
    80              expr: |||
    81                (
    82                  histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > %(bucketOpsP99LatencyThreshold)s
    83                and
    84                  sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_duration_seconds_count{%(selector)s}[5m])) > 0
    85                )
    86              ||| % thanos.store,
    87              'for': '10m',
    88              labels: {
    89                severity: 'warning',
    90              },
    91            },
    92          ],
    93        },
    94      ],
    95    },
    96  }