github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/cloud/kubernetes/prometheus/alert-rules.yaml (about)

     1  # GENERATED FILE - DO NOT EDIT
     2  apiVersion: monitoring.coreos.com/v1
     3  kind: PrometheusRule  # recording and alerting rule groups are listed under spec.groups
     4  metadata:
     5    labels:  # NOTE(review): presumably matched by a rule selector on the Prometheus resource - confirm
     6      app: cockroachdb
     7      prometheus: cockroachdb
     8      role: alert-rules
     9    name: prometheus-cockroachdb-rules
    10  spec:
    11    groups:  # three groups follow: dummy, aggregation (recording rules) and alerts
    12    - name: rules/dummy.rules
    13      rules:
    14      - alert: TestAlertManager  # presumably an end-to-end test signal for the Alertmanager pipeline (per the name) - confirm
    15        expr: vector(1)  # constant one-sample vector with no "for" clause, so the alert is always firing
    16    - name: rules/aggregation.rules
    17      rules:
    18      - expr: sum without(store) (capacity{job="cockroachdb"})
    19        record: node:capacity  # capacity summed across the "store" label
    20      - expr: sum without(instance) (node:capacity{job="cockroachdb"})
    21        record: cluster:capacity  # node:capacity summed across the "instance" label
    22      - expr: sum without(store) (capacity_available{job="cockroachdb"})
    23        record: node:capacity_available  # capacity_available summed across the "store" label
    24      - expr: sum without(instance) (node:capacity_available{job="cockroachdb"})
    25        record: cluster:capacity_available  # node:capacity_available summed across the "instance" label
    26      - expr: capacity_available{job="cockroachdb"} / capacity{job="cockroachdb"}
    27        record: capacity_available:ratio  # available / total per raw series; read by the StoreDiskLow alert
    28      - expr: node:capacity_available{job="cockroachdb"} / node:capacity{job="cockroachdb"}
    29        record: node:capacity_available:ratio  # available / total at the node:* aggregation level
    30      - expr: cluster:capacity_available{job="cockroachdb"} / cluster:capacity{job="cockroachdb"}
    31        record: cluster:capacity_available:ratio  # available / total at the cluster:* level; read by the ClusterDiskLow alert
    32      - expr: rate(txn_durations_bucket{job="cockroachdb"}[1m])
    33        record: txn_durations_bucket:rate1m  # per-second rate of each histogram bucket over a 1m window; input to the quantile rules below
    34      - expr: histogram_quantile(0.5, txn_durations_bucket:rate1m)
    35        record: txn_durations:rate1m:quantile_50  # 0.5 quantile (median)
    36      - expr: histogram_quantile(0.75, txn_durations_bucket:rate1m)
    37        record: txn_durations:rate1m:quantile_75  # 0.75 quantile
    38      - expr: histogram_quantile(0.9, txn_durations_bucket:rate1m)
    39        record: txn_durations:rate1m:quantile_90  # 0.9 quantile
    40      - expr: histogram_quantile(0.95, txn_durations_bucket:rate1m)
    41        record: txn_durations:rate1m:quantile_95  # 0.95 quantile
    42      - expr: histogram_quantile(0.99, txn_durations_bucket:rate1m)
    43        record: txn_durations:rate1m:quantile_99  # 0.99 quantile
    44      - expr: rate(exec_latency_bucket{job="cockroachdb"}[1m])
    45        record: exec_latency_bucket:rate1m  # per-second rate of each histogram bucket over a 1m window; input to the quantile rules below
    46      - expr: histogram_quantile(0.5, exec_latency_bucket:rate1m)
    47        record: exec_latency:rate1m:quantile_50  # 0.5 quantile (median)
    48      - expr: histogram_quantile(0.75, exec_latency_bucket:rate1m)
    49        record: exec_latency:rate1m:quantile_75  # 0.75 quantile
    50      - expr: histogram_quantile(0.9, exec_latency_bucket:rate1m)
    51        record: exec_latency:rate1m:quantile_90  # 0.9 quantile
    52      - expr: histogram_quantile(0.95, exec_latency_bucket:rate1m)
    53        record: exec_latency:rate1m:quantile_95  # 0.95 quantile
    54      - expr: histogram_quantile(0.99, exec_latency_bucket:rate1m)
    55        record: exec_latency:rate1m:quantile_99  # 0.99 quantile
    56      - expr: rate(round_trip_latency_bucket{job="cockroachdb"}[1m])
    57        record: round_trip_latency_bucket:rate1m  # per-second rate of each histogram bucket over a 1m window; input to the quantile rules below
    58      - expr: histogram_quantile(0.5, round_trip_latency_bucket:rate1m)
    59        record: round_trip_latency:rate1m:quantile_50  # 0.5 quantile (median)
    60      - expr: histogram_quantile(0.75, round_trip_latency_bucket:rate1m)
    61        record: round_trip_latency:rate1m:quantile_75  # 0.75 quantile
    62      - expr: histogram_quantile(0.9, round_trip_latency_bucket:rate1m)
    63        record: round_trip_latency:rate1m:quantile_90  # 0.9 quantile
    64      - expr: histogram_quantile(0.95, round_trip_latency_bucket:rate1m)
    65        record: round_trip_latency:rate1m:quantile_95  # 0.95 quantile
    66      - expr: histogram_quantile(0.99, round_trip_latency_bucket:rate1m)
    67        record: round_trip_latency:rate1m:quantile_99  # 0.99 quantile
    68      - expr: rate(sql_exec_latency_bucket{job="cockroachdb"}[1m])
    69        record: sql_exec_latency_bucket:rate1m  # per-second rate of each histogram bucket over a 1m window; input to the quantile rules below
    70      - expr: histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)
    71        record: sql_exec_latency:rate1m:quantile_50  # 0.5 quantile (median)
    72      - expr: histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)
    73        record: sql_exec_latency:rate1m:quantile_75  # 0.75 quantile
    74      - expr: histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)
    75        record: sql_exec_latency:rate1m:quantile_90  # 0.9 quantile
    76      - expr: histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)
    77        record: sql_exec_latency:rate1m:quantile_95  # 0.95 quantile
    78      - expr: histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)
    79        record: sql_exec_latency:rate1m:quantile_99  # 0.99 quantile
    80      - expr: rate(raft_process_logcommit_latency_bucket{job="cockroachdb"}[1m])
    81        record: raft_process_logcommit_latency_bucket:rate1m  # per-second rate of each histogram bucket over a 1m window; input to the quantile rules below
    82      - expr: histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)
    83        record: raft_process_logcommit_latency:rate1m:quantile_50  # 0.5 quantile (median)
    84      - expr: histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)
    85        record: raft_process_logcommit_latency:rate1m:quantile_75  # 0.75 quantile
    86      - expr: histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)
    87        record: raft_process_logcommit_latency:rate1m:quantile_90  # 0.9 quantile
    88      - expr: histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)
    89        record: raft_process_logcommit_latency:rate1m:quantile_95  # 0.95 quantile
    90      - expr: histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)
    91        record: raft_process_logcommit_latency:rate1m:quantile_99  # 0.99 quantile
    92      - expr: rate(raft_process_commandcommit_latency_bucket{job="cockroachdb"}[1m])
    93        record: raft_process_commandcommit_latency_bucket:rate1m  # per-second rate of each histogram bucket over a 1m window; input to the quantile rules below
    94      - expr: histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)
    95        record: raft_process_commandcommit_latency:rate1m:quantile_50  # 0.5 quantile (median)
    96      - expr: histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)
    97        record: raft_process_commandcommit_latency:rate1m:quantile_75  # 0.75 quantile
    98      - expr: histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)
    99        record: raft_process_commandcommit_latency:rate1m:quantile_90  # 0.9 quantile
   100      - expr: histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)
   101        record: raft_process_commandcommit_latency:rate1m:quantile_95  # 0.95 quantile
   102      - expr: histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)
   103        record: raft_process_commandcommit_latency:rate1m:quantile_99  # 0.99 quantile
   104    - name: rules/alerts.rules
   105      rules:
   106      - alert: InstanceDown  # fires when the "up" series for a cockroachdb target equals 0
   107        annotations:
   108          description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} has
   109            been down for more than 5 minutes.'
   110          summary: Instance {{ $labels.instance }} down
   111        expr: up{job="cockroachdb"} == 0
   112        for: 5m  # the condition must hold continuously for 5 minutes before the alert fires
   113      - alert: InstanceDead  # same expression as InstanceDown, with a longer hold time
   114        annotations:
   115          description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} has
   116            been down for more than 15 minutes.'
   117          summary: Instance {{ $labels.instance }} dead
   118        expr: up{job="cockroachdb"} == 0
   119        for: 15m  # the condition must hold continuously for 15 minutes before the alert fires
   120      - alert: InstanceRestart  # fires when sys_uptime reset between 1 and 5 times (inclusive) in the last 10 minutes
   121        annotations:
   122          description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
   123            {{ $value }} time(s) in 10m'
   124          summary: Instance {{ $labels.instance }} restarted
   125        expr: resets(sys_uptime{job="cockroachdb"}[10m]) > 0 and resets(sys_uptime{job="cockroachdb"}[10m])
   126          <= 5  # inclusive bound: InstanceFlapping only fires above 5, so exactly 5 resets must be covered here
   127      - alert: InstanceFlapping  # fires when sys_uptime reset more than 5 times in the last 10 minutes
   128        annotations:
   129          description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
   130            {{ $value }} time(s) in 10m'
   131          summary: Instance {{ $labels.instance }} flapping
   132        expr: resets(sys_uptime{job="cockroachdb"}[10m]) > 5
   133      - alert: LivenessMismatch  # compares each instance's liveness_livenodes with the count of targets whose "up" is 1, per (cluster, job)
   134        annotations:
   135          description: Prometheus and {{ $labels.instance }} disagree on liveness
   136          summary: Liveness mismatch for {{ $labels.instance }}
   137        expr: (liveness_livenodes{job="cockroachdb"}) != ignoring(instance) group_left()
   138          (count by(cluster, job) (up{job="cockroachdb"} == 1))
   139        for: 5m  # the mismatch must persist for 5 minutes before the alert fires
   140        labels:
   141          severity: testing  # NOTE(review): how "testing" severity is routed is not visible in this file - confirm
   142      - alert: VersionMismatch  # counts distinct (tag, build_timestamp value) combinations per cluster; fires when there is more than one
   143        annotations:
   144          description: Cluster {{ $labels.cluster }} running {{ $value }} different
   145            versions
   146          summary: Binary version mismatch on {{ $labels.cluster }}
   147        expr: count by(cluster) (count_values by(tag, cluster) ("version", build_timestamp{job="cockroachdb"}))
   148          > 1
   149        for: 30m  # the mismatch must persist for 30 minutes before the alert fires
   150      - alert: StoreDiskLow  # fires when the recorded capacity_available:ratio drops below 0.15; no "for" clause is set
   151        annotations:
   152          summary: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value
   153            }} available disk fraction
   154        expr: capacity_available:ratio{job="cockroachdb"} < 0.15
   155      - alert: ClusterDiskLow  # fires when the recorded cluster:capacity_available:ratio drops below 0.2; no "for" clause is set
   156        annotations:
   157          summary: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
   158        expr: cluster:capacity_available:ratio{job="cockroachdb"} < 0.2
   159      - alert: ZeroSQLQps  # fires when sql_conns is above 0 while the 5m rate of sql_query_count is exactly 0
   160        annotations:
   161          summary: Instance {{ $labels.instance }} has SQL connections but no queries
   162        expr: sql_conns{job="cockroachdb"} > 0 and rate(sql_query_count{job="cockroachdb"}[5m])
   163          == 0
   164        for: 10m  # the condition must hold continuously for 10 minutes before the alert fires
   165      - alert: UnavailableRanges  # fires when ranges_unavailable summed per (instance, cluster) is above 0
   166        annotations:
   167          summary: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
   168        expr: (sum by(instance, cluster) (ranges_unavailable{job="cockroachdb"})) >
   169          0
   170        for: 10m  # the condition must hold continuously for 10 minutes before the alert fires
   171        labels:
   172          severity: testing
   173      - alert: NoLeaseRanges  # fires when replicas_leaders_not_leaseholders summed per (instance, cluster) is above 0
   174        annotations:  # NOTE(review): the metric name reads as "leaders that are not leaseholders" while the summary says "ranges without leases" - confirm the wording
   175          summary: Instance {{ $labels.instance }} has {{ $value }} ranges without leases
   176        expr: (sum by(instance, cluster) (replicas_leaders_not_leaseholders{job="cockroachdb"}))
   177          > 0
   178        for: 10m  # the condition must hold continuously for 10 minutes before the alert fires
   179        labels:
   180          severity: testing
   181      - alert: CACertificateExpiresSoon  # metric minus time() is compared with 86400 * 366 seconds (366 days), i.e. the metric is treated as an expiry timestamp
   182        annotations:  # the "> 0" guard in expr skips series reporting 0 - presumably "no certificate loaded"; confirm
   183          summary: CA certificate for {{ $labels.instance }} expires in less than a
   184            year
   185        expr: (security_certificate_expiration_ca{job="cockroachdb"} > 0) and (security_certificate_expiration_ca{job="cockroachdb"}
   186          - time()) < 86400 * 366
   187        labels:
   188          frequency: daily  # NOTE(review): presumably consumed by Alertmanager routing to limit notifications - confirm
   189      - alert: NodeCertificateExpiresSoon  # metric minus time() is compared with 86400 * 183 seconds (183 days), i.e. the metric is treated as an expiry timestamp
   190        annotations:  # the "> 0" guard in expr skips series reporting 0 - presumably "no certificate loaded"; confirm
   191          summary: Node certificate for {{ $labels.instance }} expires in less than
   192            six months
   193        expr: (security_certificate_expiration_node{job="cockroachdb"} > 0) and (security_certificate_expiration_node{job="cockroachdb"}
   194          - time()) < 86400 * 183
   195        labels:
   196          frequency: daily  # NOTE(review): presumably consumed by Alertmanager routing to limit notifications - confirm
   197      - alert: HighOpenFDCount  # fires when sys_fd_open divided by sys_fd_softlimit exceeds 0.8
   198        annotations:
   199          summary: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value
   200            }} fraction used'
   201        expr: sys_fd_open{job="cockroachdb"} / sys_fd_softlimit{job="cockroachdb"} >
   202          0.8
   203        for: 10m  # the condition must hold continuously for 10 minutes before the alert fires
   204        labels:
   205          severity: testing