github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/monitoring/rules/alerts.rules.yml

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/monitoring/rules/alerts.rules.yml (about)

     1  groups:
     2  - name: rules/alerts.rules
     3    rules:
     4    # Fires once a cockroachdb target's "up" series has been 0 for 5 consecutive minutes.
     5    - alert: InstanceDown
     6      expr: 'up{job="cockroachdb"} == 0'
     7      for: 5m
     8      annotations:
     9        summary: 'Instance {{ $labels.instance }} down'
    10        description: >-
    11          {{ $labels.instance }} for cluster {{ $labels.cluster }} has been down for more than 5 minutes.
    12    # Fires once a cockroachdb target's "up" series has been 0 for 15 consecutive minutes.
    13    - alert: InstanceDead
    14      expr: 'up{job="cockroachdb"} == 0'
    15      for: 15m
    16      annotations:
    17        summary: 'Instance {{ $labels.instance }} dead'
    18        description: >-
    19          {{ $labels.instance }} for cluster {{ $labels.cluster }} has been down for more than 15 minutes.
    20    # Alert on instance restarts: between 1 and 5 resets of sys_uptime within 10 minutes.
    21    # The upper bound is inclusive so exactly 5 resets still alerts; InstanceFlapping only covers more than 5.
    22    - alert: InstanceRestart
    23      expr: resets(sys_uptime{job="cockroachdb"}[10m]) > 0 and resets(sys_uptime{job="cockroachdb"}[10m]) <= 5
    24      annotations:
    25        description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
    26          {{ $value }} time(s) in 10m'
    27        summary: Instance {{ $labels.instance }} restarted
    28    # Flapping (frequent restarts): sys_uptime reset more than 5 times within 10 minutes.
    29    - alert: InstanceFlapping
    30      expr: 'resets(sys_uptime{job="cockroachdb"}[10m]) > 5'
    31      annotations:
    32        summary: 'Instance {{ $labels.instance }} flapping'
    33        description: >-
    34          {{ $labels.instance }} for cluster {{ $labels.cluster }} restarted {{ $value }} time(s) in 10m
    35    # Cross-check of "up" (from prometheus) against "liveness" (from cockroach), evaluated at the node level.
    36    # Fires when an instance's "liveness_livenodes" differs from its cluster's count(up == 1) for 5 minutes.
    37    - alert: LivenessMismatch
    38      expr: '(liveness_livenodes{job="cockroachdb"})
    39        != ignoring(instance) group_left() (count by(cluster, job) (up{job="cockroachdb"} == 1))'
    40      for: 5m
    41      labels:
    42        severity: testing
    43      annotations:
    44        summary: 'Liveness mismatch for {{ $labels.instance }}'
    45        description: 'Prometheus and {{ $labels.instance }} disagree on liveness'
    46    # Version mismatch: more than one distinct (tag, build_timestamp value) combination within a cluster.
    47    # The 30-minute hold is intentionally loose so that rolling upgrades do not trigger it.
    48    # It may need to be adjusted for large clusters.
    49    - alert: VersionMismatch
    50      expr: >-
    51        count by(cluster) (count_values by(tag, cluster) ("version", build_timestamp{job="cockroachdb"})) > 1
    52      for: 30m
    53      annotations:
    54        summary: 'Binary version mismatch on {{ $labels.cluster }}'
    55        description: 'Cluster {{ $labels.cluster }} running {{ $value }} different versions'
    56    # Available capacity alerts. Store-level check: capacity_available:ratio below 0.15.
    57    - alert: StoreDiskLow
    58      expr: 'capacity_available:ratio{job="cockroachdb"} < 0.15'
    59      annotations:
    60        summary: >-
    61          Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value }} available disk fraction
    62    - alert: ClusterDiskLow  # Fires when the cluster-level available-capacity ratio is below 0.2.
    63      expr: 'cluster:capacity_available:ratio{job="cockroachdb"} < 0.2'
    64      annotations:
    65        summary: 'Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction'
    66    # Zero SQL qps: sql_conns above 0 while the 5m rate of sql_query_count is exactly 0, held for 10m.
    67    - alert: ZeroSQLQps
    68      expr: >-
    69        sql_conns{job="cockroachdb"} > 0 and rate(sql_query_count{job="cockroachdb"}[5m]) == 0
    70      for: 10m
    71      annotations:
    72        summary: 'Instance {{ $labels.instance }} has SQL connections but no queries'
    73    # Unavailable ranges: ranges_unavailable summed by (instance, cluster) stays above 0 for 10m.
    74    - alert: UnavailableRanges
    75      expr: '(sum by(instance, cluster) (ranges_unavailable{job="cockroachdb"})) > 0'
    76      for: 10m
    77      annotations:
    78        summary: 'Instance {{ $labels.instance }} has {{ $value }} unavailable ranges'
    79      labels:
    80        severity: testing
    81    # Leader-not-leaseholder ranges: the metric summed by (instance, cluster) stays above 0 for 10m.
    82    - alert: NoLeaseRanges
    83      expr: >-
    84        (sum by(instance, cluster) (replicas_leaders_not_leaseholders{job="cockroachdb"})) > 0
    85      for: 10m
    86      annotations:
    87        summary: 'Instance {{ $labels.instance }} has {{ $value }} ranges without leases'
    88      labels:
    89        severity: testing
    90    # Certificate expiration. Alerts are per node. Only series with a value above 0 are considered; threshold is 86400 * 366 seconds (366 days).
    91    - alert: CACertificateExpiresSoon
    92      expr: '(security_certificate_expiration_ca{job="cockroachdb"} > 0)
    93        and (security_certificate_expiration_ca{job="cockroachdb"} - time()) < 86400 * 366'
    94      annotations:
    95        summary: 'CA certificate for {{ $labels.instance }} expires in less than a year'
    96      labels:
    97        frequency: daily
    98    - alert: ClientCACertificateExpiresSoon  # Threshold: 86400 * 366 seconds (366 days); series at 0 are ignored.
    99      expr: '(security_certificate_expiration_client_ca{job="cockroachdb"} > 0)
   100        and (security_certificate_expiration_client_ca{job="cockroachdb"} - time()) < 86400 * 366'
   101      annotations:
   102        summary: 'Client CA certificate for {{ $labels.instance }} expires in less than a year'
   103      labels:
   104        frequency: daily
   105    - alert: UICACertificateExpiresSoon  # Threshold: 86400 * 366 seconds (366 days); series at 0 are ignored.
   106      expr: '(security_certificate_expiration_ui_ca{job="cockroachdb"} > 0)
   107        and (security_certificate_expiration_ui_ca{job="cockroachdb"} - time()) < 86400 * 366'
   108      annotations:
   109        summary: 'UI CA certificate for {{ $labels.instance }} expires in less than a year'
   110      labels:
   111        frequency: daily
   112    - alert: NodeCertificateExpiresSoon  # Threshold: 86400 * 183 seconds (183 days); series at 0 are ignored.
   113      expr: '(security_certificate_expiration_node{job="cockroachdb"} > 0)
   114        and (security_certificate_expiration_node{job="cockroachdb"} - time()) < 86400 * 183'
   115      annotations:
   116        summary: 'Node certificate for {{ $labels.instance }} expires in less than six months'
   117      labels:
   118        frequency: daily
   119    - alert: NodeClientCertificateExpiresSoon  # Threshold: 86400 * 183 seconds (183 days); series at 0 are ignored.
   120      expr: '(security_certificate_expiration_node_client{job="cockroachdb"} > 0)
   121        and (security_certificate_expiration_node_client{job="cockroachdb"} - time()) < 86400 * 183'
   122      annotations:
   123        summary: 'Client certificate for {{ $labels.instance }} expires in less than six months'
   124      labels:
   125        frequency: daily
   126    - alert: UICertificateExpiresSoon  # Threshold: 86400 * 30 seconds (30 days); series at 0 are ignored.
   127      expr: '(security_certificate_expiration_ui{job="cockroachdb"} > 0)
   128        and (security_certificate_expiration_ui{job="cockroachdb"} - time()) < 86400 * 30'
   129      annotations:
   130        summary: 'UI certificate for {{ $labels.instance }} expires in less than 30 days'
   131      labels:
   132        frequency: daily
   133    # Getting close to the open file descriptor limit: sys_fd_open / sys_fd_softlimit above 0.8 for 10m.
   134    - alert: HighOpenFDCount
   135      expr: 'sys_fd_open{job="cockroachdb"} / sys_fd_softlimit{job="cockroachdb"} > 0.8'
   136      for: 10m
   137      labels:
   138        severity: testing
   139      annotations:
   140        summary: >-
   141          Too many open file descriptors on {{ $labels.instance }}: {{ $value }} fraction used