// Source: github.com/thanos-io/thanos@v0.32.5/mixin/alerts/store.libsonnet
//
// Prometheus alerting rules for Thanos Store.
//
// This object is meant to be mixed into a larger configuration that provides
// `targetGroups` and overrides `store.selector`; every rule below is rendered
// by formatting a PromQL template with the fields of `store`.
{
  local thanos = self,

  // Label names contributed by the target groups; reused for both the
  // aggregation dimensions and the human-readable location suffix.
  local targetLabels = std.objectFields(thanos.targetGroups),

  // Suffix appended to alert descriptions, e.g. ' in {{$labels.a}}/{{$labels.b}}'.
  // Empty when no target groups are configured.
  local location =
    if std.length(targetLabels) == 0 then
      ''
    else
      ' in ' + std.join('/', std.map(function(level) '{{$labels.' + level + '}}', targetLabels)),

  // Builds one warning-severity alerting rule. `query` is a PromQL template
  // whose %(name)s placeholders are filled from the fields of `thanos.store`.
  local warningRule(name, summary, description, query, duration) = {
    alert: name,
    annotations: {
      description: description,
      summary: summary,
    },
    expr: query % thanos.store,
    'for': duration,
    labels: {
      severity: 'warning',
    },
  },

  store+:: {
    // Must be overridden by the consumer; evaluating any rule without it fails.
    selector: error 'must provide selector for Thanos Store alerts',
    // Percentage of failed gRPC requests above which ThanosStoreGrpcErrorRate fires.
    grpcErrorThreshold: 5,
    // Not referenced by any rule in this file; kept as part of the config surface.
    compactionErrorThreshold: 5,
    // p99 series gate duration (seconds) above which ThanosStoreSeriesGateLatencyHigh fires.
    seriesGateErrorThreshold: 2,
    // Percentage of failed bucket operations above which ThanosStoreBucketHighOperationFailures fires.
    bucketOpsErrorThreshold: 5,
    // p99 bucket operation duration (seconds) above which ThanosStoreObjstoreOperationLatencyHigh fires.
    bucketOpsP99LatencyThreshold: 2,
    // Comma-separated label list used in every `sum by (...)` clause.
    dimensions: std.join(', ', targetLabels + ['job']),
  },

  prometheusAlerts+:: {
    // Setting `store` to null disables the whole rule group.
    groups+: if thanos.store == null then [] else [
      {
        name: 'thanos-store',
        rules: [
          // Share of gRPC requests finishing with a server-side error code.
          warningRule(
            name='ThanosStoreGrpcErrorRate',
            summary='Thanos Store is failing to handle gRPC requests.',
            description='Thanos Store {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.' % location,
            query=|||
              (
                sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m]))
              * 100 > %(grpcErrorThreshold)s
              )
            |||,
            duration='5m',
          ),
          // p99 series gate duration; the `and ... > 0` clause requires observed traffic.
          warningRule(
            name='ThanosStoreSeriesGateLatencyHigh',
            summary='Thanos Store has high latency for store series gate requests.',
            description='Thanos Store {{$labels.job}}%s has a 99th percentile latency of {{$value}} seconds for store series gate requests.' % location,
            query=|||
              (
                histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > %(seriesGateErrorThreshold)s
              and
                sum by (%(dimensions)s) (rate(thanos_bucket_store_series_gate_duration_seconds_count{%(selector)s}[5m])) > 0
              )
            |||,
            duration='10m',
          ),
          // Share of object-store bucket operations that failed.
          warningRule(
            name='ThanosStoreBucketHighOperationFailures',
            summary='Thanos Store Bucket is failing to execute operations.',
            description='Thanos Store {{$labels.job}}%s Bucket is failing to execute {{$value | humanize}}%% of operations.' % location,
            query=|||
              (
                sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m]))
              * 100 > %(bucketOpsErrorThreshold)s
              )
            |||,
            duration='15m',
          ),
          // p99 bucket operation duration; the `and ... > 0` clause requires observed traffic.
          warningRule(
            name='ThanosStoreObjstoreOperationLatencyHigh',
            summary='Thanos Store is having high latency for bucket operations.',
            description='Thanos Store {{$labels.job}}%s Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.' % location,
            query=|||
              (
                histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > %(bucketOpsP99LatencyThreshold)s
              and
                sum by (%(dimensions)s) (rate(thanos_objstore_bucket_operation_duration_seconds_count{%(selector)s}[5m])) > 0
              )
            |||,
            duration='10m',
          ),
        ],
      },
    ],
  },
}