github.com/thanos-io/thanos@v0.32.5/examples/alerts/rules.yaml (about)

     1  groups:
     2  - name: thanos-query.rules  # Recording rules over series whose job label matches ".*thanos-query.*".
     3    rules:
     4    - record: :grpc_client_failures_per_unary:sum_rate  # Per job: 5m rate of unary gRPC client calls handled with a listed error code / 5m rate of unary calls started.
     5      expr: |
     6        (
     7          sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*", grpc_type="unary"}[5m]))
     8        /
     9          sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*", grpc_type="unary"}[5m]))
    10        )
    11    - record: :grpc_client_failures_per_stream:sum_rate  # Same ratio as the unary rule, restricted to grpc_type="server_stream".
    12      expr: |
    13        (
    14          sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*", grpc_type="server_stream"}[5m]))
    15        /
    16          sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*", grpc_type="server_stream"}[5m]))
    17        )
    18    - record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate  # Per job: 5m rate of store-API DNS failures / 5m rate of store-API DNS lookups.
    19      expr: |
    20        (
    21          sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))
    22        /
    23          sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))
    24        )
    25    - record: :query_duration_seconds:histogram_quantile  # Per job: 0.99 quantile of HTTP request duration for handler="query", from 5m bucket rates.
    26      labels:
    27        quantile: "0.99"  # Quoted so the label value is the string "0.99", matching the quantile passed to histogram_quantile.
    28      expr: |
    29        histogram_quantile(0.99,
    30          sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))
    31        )
    32    - record: :api_range_query_duration_seconds:histogram_quantile  # Per job: 0.99 quantile of HTTP request duration for handler="query_range", from 5m bucket rates.
    33      labels:
    34        quantile: "0.99"  # Quoted so the label value is the string "0.99", matching the quantile passed to histogram_quantile.
    35      expr: |
    36        histogram_quantile(0.99,
    37          sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))
    38        )
    39  - name: thanos-receive.rules  # Recording rules over series whose job label matches ".*thanos-receive.*".
    40    rules:
    41    - record: :grpc_server_failures_per_unary:sum_rate  # Per job: 5m rate of unary gRPC server calls handled with a listed error code / 5m rate of unary calls started.
    42      expr: |
    43        (
    44          sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-receive.*", grpc_type="unary"}[5m]))
    45        /
    46          sum by (job) (rate(grpc_server_started_total{job=~".*thanos-receive.*", grpc_type="unary"}[5m]))
    47        )
    48    - record: :grpc_server_failures_per_stream:sum_rate  # Same ratio as the unary rule, restricted to grpc_type="server_stream".
    49      expr: |
    50        (
    51          sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-receive.*", grpc_type="server_stream"}[5m]))
    52        /
    53          sum by (job) (rate(grpc_server_started_total{job=~".*thanos-receive.*", grpc_type="server_stream"}[5m]))
    54        )
    55    - record: :http_failure_per_request:sum_rate  # Per job: 5m rate of handler="receive" requests with a 5xx code / 5m rate of all handler="receive" requests.
    56      expr: |
    57        (
    58          sum by (job) (rate(http_requests_total{handler="receive", job=~".*thanos-receive.*", code=~"5.."}[5m]))
    59        /
    60          sum by (job) (rate(http_requests_total{handler="receive", job=~".*thanos-receive.*"}[5m]))
    61        )
    62    - record: :http_request_duration_seconds:histogram_quantile  # Per job: 0.99 quantile of HTTP request duration for handler="receive", from 5m bucket rates.
    63      labels:
    64        quantile: "0.99"  # Quoted so the label value is the string "0.99", matching the quantile passed to histogram_quantile.
    65      expr: |
    66        histogram_quantile(0.99,
    67          sum by (job, le) (rate(http_request_duration_seconds_bucket{handler="receive", job=~".*thanos-receive.*"}[5m]))
    68        )
    69    - record: :thanos_receive_replication_failure_per_requests:sum_rate  # Per job: 5m rate of replications with result="error" / 5m rate of all replications.
    70      expr: |
    71        (
    72          sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m]))
    73        /
    74          sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))
    75        )
    76    - record: :thanos_receive_forward_failure_per_requests:sum_rate  # Per job: 5m rate of forward requests with result="error" / 5m rate of all forward requests.
    77      expr: |
    78        (
    79          sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))
    80        /
    81          sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))
    82        )
    83    - record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate  # Per job: 5m rate of hashrings-file errors / 5m rate of hashrings-file refreshes.
    84      expr: |
    85        (
    86          sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))
    87        /
    88          sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))
    89        )
    90  - name: thanos-store.rules  # Recording rules over series whose job label matches ".*thanos-store.*".
    91    rules:  # NOTE(review): unlike the query/receive groups, the grpc_code regex here omits ResourceExhausted - confirm this is intentional.
    92    - record: :grpc_server_failures_per_unary:sum_rate  # Per job: 5m rate of unary gRPC server calls handled with a listed error code / 5m rate of unary calls started.
    93      expr: |
    94        (
    95          sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="unary"}[5m]))
    96        /
    97          sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*", grpc_type="unary"}[5m]))
    98        )
    99    - record: :grpc_server_failures_per_stream:sum_rate  # Same ratio as the unary rule, restricted to grpc_type="server_stream".
   100      expr: |
   101        (
   102          sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="server_stream"}[5m]))
   103        /
   104          sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*", grpc_type="server_stream"}[5m]))
   105        )
   106    - record: :thanos_objstore_bucket_failures_per_operation:sum_rate  # Per job: 5m rate of object-store bucket operation failures / 5m rate of bucket operations.
   107      expr: |
   108        (
   109          sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m]))
   110        /
   111          sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m]))
   112        )
   113    - record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile  # Per job: 0.99 quantile of bucket operation duration, from 5m bucket rates.
   114      labels:
   115        quantile: "0.99"  # Quoted so the label value is the string "0.99", matching the quantile passed to histogram_quantile.
   116      expr: |
   117        histogram_quantile(0.99,
   118          sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))
   119        )
   120  - name: thanos-bucket-replicate.rules  # Group declared for the bucket-replicate component.
   121    rules: []  # Explicit empty list: this group defines no recording rules.