github.com/thanos-io/thanos@v0.32.5/examples/alerts/alerts.yaml (about)

     1  groups:
     2  - name: thanos-compact
     3    rules:
     4    - alert: ThanosCompactMultipleRunning
     5      annotations:
     6        description: No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.
     7        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning
     8        summary: Thanos Compact has multiple instances running.
     9      expr: sum by (job) (up{job=~".*thanos-compact.*"}) > 1
    10      for: 5m
    11      labels:
    12        severity: warning
    13    - alert: ThanosCompactHalted
    14      annotations:
    15        description: Thanos Compact {{$labels.job}} has failed to run and now is halted.
    16        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted
    17        summary: Thanos Compact has failed to run and is now halted.
    18      expr: thanos_compact_halted{job=~".*thanos-compact.*"} == 1
    19      for: 5m
    20      labels:
    21        severity: warning
    22    - alert: ThanosCompactHighCompactionFailures
    23      annotations:
    24        description: Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.
    25        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures
    26        summary: Thanos Compact is failing to execute compactions.
    27      expr: |
    28        (
    29          sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m]))
    30        /
    31          sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m]))
    32        * 100 > 5
    33        )
    34      for: 15m
    35      labels:
    36        severity: warning
    37    - alert: ThanosCompactBucketHighOperationFailures
    38      annotations:
    39        description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.
    40        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures
    41        summary: Thanos Compact Bucket is having a high number of operation failures.
    42      expr: |
    43        (
    44          sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m]))
    45        /
    46          sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m]))
    47        * 100 > 5
    48        )
    49      for: 15m
    50      labels:
    51        severity: warning
    52    - alert: ThanosCompactHasNotRun
    53      annotations:
    54        description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
    55        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun
    56        summary: Thanos Compact has not uploaded anything for last 24 hours.
    57      expr: (time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24
    58      labels:
    59        severity: warning
    60  - name: thanos-query
    61    rules:
    62    - alert: ThanosQueryHttpRequestQueryErrorRateHigh
    63      annotations:
    64        description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.
    65        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh
    66        summary: Thanos Query is failing to handle requests.
    67      expr: |
    68        (
    69          sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))
    70        /
    71          sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))
    72        ) * 100 > 5
    73      for: 5m
    74      labels:
    75        severity: critical
    76    - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
    77      annotations:
    78        description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.
    79        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh
    80        summary: Thanos Query is failing to handle requests.
    81      expr: |
    82        (
    83          sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))
    84        /
    85          sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))
    86        ) * 100 > 5
    87      for: 5m
    88      labels:
    89        severity: critical
    90    - alert: ThanosQueryGrpcServerErrorRate
    91      annotations:
    92        description: Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.
    93        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate
    94        summary: Thanos Query is failing to handle requests.
    95      expr: |
    96        (
    97          sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))
    98        /
    99          sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))
   100        * 100 > 5
   101        )
   102      for: 5m
   103      labels:
   104        severity: warning
   105    - alert: ThanosQueryGrpcClientErrorRate
   106      annotations:
   107        description: Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.
   108        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate
   109        summary: Thanos Query is failing to send requests.
   110      expr: |
   111        (
   112          sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))
   113        /
   114          sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))
   115        ) * 100 > 5
   116      for: 5m
   117      labels:
   118        severity: warning
   119    - alert: ThanosQueryHighDNSFailures
   120      annotations:
   121        description: Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints.
   122        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures
   123        summary: Thanos Query is having high number of DNS failures.
   124      expr: |
   125        (
   126          sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))
   127        /
   128          sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))
   129        ) * 100 > 1
   130      for: 15m
   131      labels:
   132        severity: warning
   133    - alert: ThanosQueryInstantLatencyHigh
   134      annotations:
   135        description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.
   136        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh
   137        summary: Thanos Query has high latency for queries.
   138      expr: |
   139        ( # p99 latency of "query" requests per job, above 40 seconds
   140          histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40
   141        and # traffic guard: only keep the result while "query" requests are being observed
   142          sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0
   143        )
   144      for: 10m
   145      labels:
   146        severity: critical
   147    - alert: ThanosQueryRangeLatencyHigh
   148      annotations:
   149        description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.
   150        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh
   151        summary: Thanos Query has high latency for queries.
   152      expr: |
   153        (
   154          histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90
   155        and
   156          sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0
   157        )
   158      for: 10m
   159      labels:
   160        severity: critical
   161    - alert: ThanosQueryOverload
   162      annotations:
   163        description: Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.
   164        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
   165        summary: Thanos query reaches its maximum capacity serving concurrent requests.
   166      expr: |
   167        (
   168          max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
   169        )
   170      for: 15m
   171      labels:
   172        severity: warning
   173  - name: thanos-receive
   174    rules:
   175    - alert: ThanosReceiveHttpRequestErrorRateHigh
   176      annotations:
   177        description: Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.
   178        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh
   179        summary: Thanos Receive is failing to handle requests.
   180      expr: |
   181        (
   182          sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))
   183        /
   184          sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))
   185        ) * 100 > 5
   186      for: 5m
   187      labels:
   188        severity: critical
   189    - alert: ThanosReceiveHttpRequestLatencyHigh
   190      annotations:
   191        description: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.
   192        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequestlatencyhigh
   193        summary: Thanos Receive has high HTTP requests latency.
   194      expr: |
   195        (
   196          histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10
   197        and
   198          sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0
   199        )
   200      for: 10m
   201      labels:
   202        severity: critical
   203    - alert: ThanosReceiveHighReplicationFailures
   204      annotations:
   205        description: Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.
   206        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures
   207        summary: Thanos Receive is having high number of replication failures.
   208      expr: |
   209        ( # replication error ratio per job, kept only when it exceeds the tolerated ratio
   210          (
   211            sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m]))
   212          /
   213            sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))
   214          )
   215          >
   216          (
   217            max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1) / 2))
   218          /
   219            max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"})
   220          )
   221        ) * 100 # scaled to a percentage; as the left operand of "and" it supplies the alert value
   222          and on (job) # both operands are aggregated to job, so match on job only
   223        max by (job) (thanos_receive_replication_factor{job=~".*thanos-receive.*"}) > 1
   224      for: 5m
   225      labels:
   226        severity: warning
   227    - alert: ThanosReceiveHighForwardRequestFailures
   228      annotations:
   229        description: Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.
   230        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures
   231        summary: Thanos Receive is failing to forward requests.
   232      expr: |
   233        (
   234          sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))
   235        /
   236          sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))
   237        ) * 100 > 20
   238      for: 5m
   239      labels:
   240        severity: info
   241    - alert: ThanosReceiveHighHashringFileRefreshFailures
   242      annotations:
   243        description: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.
   244        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures
   245        summary: Thanos Receive is failing to refresh hashring file.
   246      expr: |
   247        (
   248          sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))
   249        /
   250          sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))
   251        > 0
   252        )
   253      for: 15m
   254      labels:
   255        severity: warning
   256    - alert: ThanosReceiveConfigReloadFailure
   257      annotations:
   258        description: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.
   259        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure
   260        summary: Thanos Receive has not been able to reload configuration.
   261      expr: avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1
   262      for: 5m
   263      labels:
   264        severity: warning
   265    - alert: ThanosReceiveNoUpload
   266      annotations:
   267        description: Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.
   268        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload
   269        summary: Thanos Receive has not uploaded latest data to object storage.
   270      expr: |
   271        (up{job=~".*thanos-receive.*"} - 1)
   272        + on (job, instance) # filters to only alert on current instance last 3h
   273        (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)
   274      for: 3h
   275      labels:
   276        severity: critical
   277    - alert: ThanosReceiveLimitsConfigReloadFailure
   278      annotations:
   279        description: Thanos Receive {{$labels.job}} has not been able to reload the limits configuration.
   280        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitsconfigreloadfailure
   281        summary: Thanos Receive has not been able to reload the limits configuration.
   282      expr: sum by(job) (increase(thanos_receive_limits_config_reload_err_total{job=~".*thanos-receive.*"}[5m])) > 0
   283      for: 5m
   284      labels:
   285        severity: warning
   286    - alert: ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate
   287      annotations:
   288        description: Thanos Receive {{$labels.job}} is failing for {{$value | humanize}}% of meta monitoring queries.
   289        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivelimitshighmetamonitoringqueriesfailurerate
   290        summary: Thanos Receive has not been able to update the number of head series.
   291      expr: (sum by(job) (increase(thanos_receive_metamonitoring_failed_queries_total{job=~".*thanos-receive.*"}[5m])) / 20) * 100 > 20
   292      for: 5m
   293      labels:
   294        severity: warning
   295    - alert: ThanosReceiveTenantLimitedByHeadSeries
   296      annotations:
   297        description: Thanos Receive tenant {{$labels.tenant}} is limited by head series.
   298        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetenantlimitedbyheadseries
   299        summary: A Thanos Receive tenant is limited by head series.
   300      expr: sum by(job, tenant) (increase(thanos_receive_head_series_limited_requests_total{job=~".*thanos-receive.*"}[5m])) > 0
   301      for: 5m
   302      labels:
   303        severity: warning
   304  - name: thanos-sidecar
   305    rules:
   306    - alert: ThanosSidecarBucketOperationsFailed
   307      annotations:
   308        description: Thanos Sidecar {{$labels.instance}} bucket operations are failing
   309        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed
   310        summary: Thanos Sidecar bucket operations are failing
   311      expr: |
   312        sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0
   313      for: 5m
   314      labels:
   315        severity: critical
   316    - alert: ThanosSidecarNoConnectionToStartedPrometheus
   317      annotations:
   318        description: Thanos Sidecar {{$labels.instance}} is unhealthy.
   319        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarnoconnectiontostartedprometheus
   320        summary: Thanos Sidecar cannot access Prometheus, even though Prometheus seems healthy and has reloaded WAL.
   321      expr: |
   322        thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0
   323        AND on (namespace, pod)
   324        prometheus_tsdb_data_replay_duration_seconds != 0
   325      for: 5m
   326      labels:
   327        severity: critical
   328  - name: thanos-store
   329    rules:
   330    - alert: ThanosStoreGrpcErrorRate
   331      annotations:
   332        description: Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.
   333        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate
   334        summary: Thanos Store is failing to handle gRPC requests.
   335      expr: |
   336        (
   337          sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))
   338        /
   339          sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m]))
   340        * 100 > 5
   341        )
   342      for: 5m
   343      labels:
   344        severity: warning
   345    - alert: ThanosStoreSeriesGateLatencyHigh
   346      annotations:
   347        description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.
   348        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh
   349        summary: Thanos Store has high latency for store series gate requests.
   350      expr: |
   351        (
   352          histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2
   353        and
   354          sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0
   355        )
   356      for: 10m
   357      labels:
   358        severity: warning
   359    - alert: ThanosStoreBucketHighOperationFailures
   360      annotations:
   361        description: Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.
   362        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures
   363        summary: Thanos Store Bucket is failing to execute operations.
   364      expr: |
   365        (
   366          sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m]))
   367        /
   368          sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m]))
   369        * 100 > 5
   370        )
   371      for: 15m
   372      labels:
   373        severity: warning
   374    - alert: ThanosStoreObjstoreOperationLatencyHigh
   375      annotations:
   376        description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.
   377        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh
   378        summary: Thanos Store is having high latency for bucket operations.
   379      expr: |
   380        (
   381          histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2
   382        and
   383          sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0
   384        )
   385      for: 10m
   386      labels:
   387        severity: warning
   388  - name: thanos-rule
   389    rules:
   390    - alert: ThanosRuleQueueIsDroppingAlerts
   391      annotations:
   392        description: Thanos Rule {{$labels.instance}} is failing to queue alerts.
   393        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts
   394        summary: Thanos Rule is failing to queue alerts.
   395      expr: |
   396        sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
   397      for: 5m
   398      labels:
   399        severity: critical
   400    - alert: ThanosRuleSenderIsFailingAlerts
   401      annotations:
   402        description: Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.
   403        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts
   404        summary: Thanos Rule is failing to send alerts to alertmanager.
   405      expr: |
   406        sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
   407      for: 5m
   408      labels:
   409        severity: critical
   410    - alert: ThanosRuleHighRuleEvaluationFailures
   411      annotations:
   412        description: Thanos Rule {{$labels.instance}} is failing to evaluate rules.
   413        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures
   414        summary: Thanos Rule is failing to evaluate rules.
   415      expr: |
   416        (
   417          sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m]))
   418        /
   419          sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))
   420        * 100 > 5
   421        )
   422      for: 5m
   423      labels:
   424        severity: critical
   425    - alert: ThanosRuleHighRuleEvaluationWarnings
   426      annotations:
   427        description: Thanos Rule {{$labels.instance}} has high number of evaluation warnings.
   428        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings
   429        summary: Thanos Rule has high number of evaluation warnings.
   430      expr: |
   431        sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0
   432      for: 15m
   433      labels:
   434        severity: info
   435    - alert: ThanosRuleRuleEvaluationLatencyHigh
   436      annotations:
   437        description: Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.
   438        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh
   439        summary: Thanos Rule has high rule evaluation latency.
   440      expr: |
   441        (
   442          sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"})
   443        >
   444          sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
   445        )
   446      for: 5m
   447      labels:
   448        severity: warning
   449    - alert: ThanosRuleGrpcErrorRate
   450      annotations:
   451        description: Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.
   452        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate
   453        summary: Thanos Rule is failing to handle grpc requests.
   454      expr: |
   455        (
   456          sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))
   457        /
   458          sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m]))
   459        * 100 > 5
   460        )
   461      for: 5m
   462      labels:
   463        severity: warning
   464    - alert: ThanosRuleConfigReloadFailure
   465      annotations:
   466        description: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
   467        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure
   468        summary: Thanos Rule has not been able to reload configuration.
   469      expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1
   470      for: 5m
   471      labels:
   472        severity: info
   473    - alert: ThanosRuleQueryHighDNSFailures
   474      annotations:
   475        description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.
   476        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures
   477        summary: Thanos Rule is having high number of DNS failures.
   478      expr: |
   479        (
   480          sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
   481        /
   482          sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
   483        * 100 > 1
   484        )
   485      for: 15m
   486      labels:
   487        severity: warning
   488    - alert: ThanosRuleAlertmanagerHighDNSFailures
   489      annotations:
   490        description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.
   491        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures
   492        summary: Thanos Rule is having high number of DNS failures.
   493      expr: |
   494        (
   495          sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
   496        /
   497          sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
   498        * 100 > 1
   499        )
   500      for: 15m
   501      labels:
   502        severity: warning
   503    - alert: ThanosRuleNoEvaluationFor10Intervals
   504      annotations:
   505        description: Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.
   506        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals
   507        summary: Thanos Rule has rule groups that did not evaluate for 10 intervals.
   508      expr: |
   509        time() -  max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})
   510        > # compared per rule group: seconds since last evaluation vs. ten evaluation intervals
   511        10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
   512      for: 5m
   513      labels:
   514        severity: info
   515    - alert: ThanosNoRuleEvaluations
   516      annotations:
   517        description: Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.
   518        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations
   519        summary: Thanos Rule did not perform any rule evaluations.
   520      expr: |
   521        sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0
   522          and
   523        sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0
   524      for: 5m
   525      labels:
   526        severity: critical
   527  - name: thanos-bucket-replicate
   528    rules:
   529    - alert: ThanosBucketReplicateErrorRate
   530      annotations:
   531        description: Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.
   532        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate
   533        summary: Thanos Replicate is failing to run.
   534      expr: |
   535        (
   536          sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))
   537        / on (job) group_left
   538          sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))
   539        ) * 100 >= 10
   540      for: 5m
   541      labels:
   542        severity: critical
   543    - alert: ThanosBucketReplicateRunLatency
   544      annotations:
   545        description: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.
   546        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency
   547        summary: Thanos Replicate has a high latency for replicate operations.
   548      expr: |
   549        ( # le is kept in the aggregation because histogram_quantile needs the bucket boundaries
   550          histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20
   551        and # traffic guard: only keep the result while replication runs are being observed
   552          sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count{job=~".*thanos-bucket-replicate.*"}[5m])) > 0
   553        )
   554      for: 5m
   555      labels:
   556        severity: critical
   557  - name: thanos-component-absent
   558    rules:
   559    - alert: ThanosCompactIsDown
   560      annotations:
   561        description: ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.
   562        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactisdown
   563        summary: Thanos component has disappeared.
   564      expr: |
   565        absent(up{job=~".*thanos-compact.*"} == 1)
   566      for: 5m
   567      labels:
   568        severity: critical
   569    - alert: ThanosQueryIsDown
   570      annotations:
   571        description: ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.
   572        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryisdown
   573        summary: Thanos component has disappeared.
   574      expr: |
   575        absent(up{job=~".*thanos-query.*"} == 1)
   576      for: 5m
   577      labels:
   578        severity: critical
   579    - alert: ThanosReceiveIsDown
   580      annotations:
   581        description: ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.
   582        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveisdown
   583        summary: Thanos component has disappeared.
   584      expr: |
   585        absent(up{job=~".*thanos-receive.*"} == 1)
   586      for: 5m
   587      labels:
   588        severity: critical
   589    - alert: ThanosRuleIsDown
   590      annotations:
   591        description: ThanosRule has disappeared. Prometheus target for the component cannot be discovered.
   592        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleisdown
   593        summary: Thanos component has disappeared.
   594      expr: |
   595        absent(up{job=~".*thanos-rule.*"} == 1)
   596      for: 5m
   597      labels:
   598        severity: critical
   599    - alert: ThanosSidecarIsDown
   600      annotations:
   601        description: ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.
   602        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown
   603        summary: Thanos component has disappeared.
   604      expr: |
   605        absent(up{job=~".*thanos-sidecar.*"} == 1)
   606      for: 5m
   607      labels:
   608        severity: critical
   609    - alert: ThanosStoreIsDown
   610      annotations:
   611        description: ThanosStore has disappeared. Prometheus target for the component cannot be discovered.
   612        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreisdown
   613        summary: Thanos component has disappeared.
   614      expr: |
   615        absent(up{job=~".*thanos-store.*"} == 1)
   616      for: 5m
   617      labels:
   618        severity: critical