# Source: github.com/thanos-io/thanos@v0.32.5/examples/alerts/rules.yaml
#
# Prometheus recording rules for Thanos components.
# Every rule aggregates by `job` and evaluates rates over a 5-minute window.
# Ratio rules divide an error/failure rate by the matching total rate, so the
# recorded value is a fraction of the total, one series per job.
groups:
# Rules over series whose job label matches ".*thanos-query.*".
- name: thanos-query.rules
  rules:
  # Fraction of started unary gRPC client calls that were handled with one of
  # the listed error codes.
  - expr: |
      (
        sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*", grpc_type="unary"}[5m]))
      /
        sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*", grpc_type="unary"}[5m]))
      )
    record: :grpc_client_failures_per_unary:sum_rate
  # Same ratio as above, restricted to server-streaming gRPC client calls.
  - expr: |
      (
        sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*", grpc_type="server_stream"}[5m]))
      /
        sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*", grpc_type="server_stream"}[5m]))
      )
    record: :grpc_client_failures_per_stream:sum_rate
  # Store API DNS failures divided by store API DNS lookups.
  - expr: |
      (
        sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))
      /
        sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))
      )
    record: :thanos_query_store_apis_dns_failures_per_lookup:sum_rate
  # 99th-percentile HTTP request duration for handler="query", computed from
  # the histogram buckets; the result carries an extra quantile="0.99" label.
  - expr: |
      histogram_quantile(0.99,
        sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))
      )
    labels:
      quantile: "0.99"
    record: :query_duration_seconds:histogram_quantile
  # 99th-percentile HTTP request duration for handler="query_range".
  - expr: |
      histogram_quantile(0.99,
        sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))
      )
    labels:
      quantile: "0.99"
    record: :api_range_query_duration_seconds:histogram_quantile
# Rules over series whose job label matches ".*thanos-receive.*".
- name: thanos-receive.rules
  rules:
  # Fraction of started unary gRPC server calls that were handled with one of
  # the listed error codes.
  - expr: |
      (
        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-receive.*", grpc_type="unary"}[5m]))
      /
        sum by (job) (rate(grpc_server_started_total{job=~".*thanos-receive.*", grpc_type="unary"}[5m]))
      )
    record: :grpc_server_failures_per_unary:sum_rate
  # Same ratio as above, restricted to server-streaming gRPC server calls.
  - expr: |
      (
        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-receive.*", grpc_type="server_stream"}[5m]))
      /
        sum by (job) (rate(grpc_server_started_total{job=~".*thanos-receive.*", grpc_type="server_stream"}[5m]))
      )
    record: :grpc_server_failures_per_stream:sum_rate
  # Fraction of handler="receive" HTTP requests answered with a 5xx status.
  # The numerator selects code=~"5.." so the value matches the "failure" in
  # the record name; selecting code!~"5.." here would record the success
  # ratio instead.
  - expr: |
      (
        sum by (job) (rate(http_requests_total{handler="receive", job=~".*thanos-receive.*", code=~"5.."}[5m]))
      /
        sum by (job) (rate(http_requests_total{handler="receive", job=~".*thanos-receive.*"}[5m]))
      )
    record: :http_failure_per_request:sum_rate
  # 99th-percentile HTTP request duration for handler="receive".
  - expr: |
      histogram_quantile(0.99,
        sum by (job, le) (rate(http_request_duration_seconds_bucket{handler="receive", job=~".*thanos-receive.*"}[5m]))
      )
    labels:
      quantile: "0.99"
    record: :http_request_duration_seconds:histogram_quantile
  # Replications with result="error" divided by all replications.
  - expr: |
      (
        sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m]))
      /
        sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))
      )
    record: :thanos_receive_replication_failure_per_requests:sum_rate
  # Forward requests with result="error" divided by all forward requests.
  - expr: |
      (
        sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))
      /
        sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))
      )
    record: :thanos_receive_forward_failure_per_requests:sum_rate
  # Hashring file errors divided by hashring file refreshes.
  - expr: |
      (
        sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))
      /
        sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))
      )
    record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate
# Rules over series whose job label matches ".*thanos-store.*".
# NOTE(review): the grpc_code matchers in this group omit ResourceExhausted,
# which the thanos-query and thanos-receive groups include. Confirm whether
# that difference is intentional before aligning them.
- name: thanos-store.rules
  rules:
  # Fraction of started unary gRPC server calls that were handled with one of
  # the listed error codes.
  - expr: |
      (
        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="unary"}[5m]))
      /
        sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*", grpc_type="unary"}[5m]))
      )
    record: :grpc_server_failures_per_unary:sum_rate
  # Same ratio as above, restricted to server-streaming gRPC server calls.
  - expr: |
      (
        sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*", grpc_type="server_stream"}[5m]))
      /
        sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*", grpc_type="server_stream"}[5m]))
      )
    record: :grpc_server_failures_per_stream:sum_rate
  # Object-store bucket operation failures divided by bucket operations.
  - expr: |
      (
        sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m]))
      /
        sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m]))
      )
    record: :thanos_objstore_bucket_failures_per_operation:sum_rate
  # 99th-percentile object-store bucket operation duration.
  - expr: |
      histogram_quantile(0.99,
        sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))
      )
    labels:
      quantile: "0.99"
    record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile
# Group declared with an empty rule list.
- name: thanos-bucket-replicate.rules
  rules: []