go.etcd.io/etcd@v3.3.27+incompatible/Documentation/op-guide/etcd3_alert.rules.yml (about)

     1  # These rules are synced manually from https://github.com/etcd-io/etcd/blob/master/Documentation/etcd-mixin/mixin.libsonnet
     2  groups:
     3  - name: etcd
     4    rules:
     5    - alert: etcdInsufficientMembers
            # Critical: per job, the count of targets with up == 1 has stayed below
            # (number of targets + 1) / 2 for 3m.
     6      expr: |
     7        sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
     8      for: 3m
     9      labels:
    10        severity: critical
    11      annotations:
    12        message: >-
    13          etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).
    14    - alert: etcdNoLeader
            # Critical: an instance's etcd_server_has_leader series has been 0 for 1m.
    15      expr: |
    16        etcd_server_has_leader{job=~".*etcd.*"} == 0
    17      for: 1m
    18      labels:
    19        severity: critical
    20      annotations:
    21        message: >-
    22          etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.
    23    - alert: etcdHighNumberOfLeaderChanges
            # Warning: an instance has seen more than 3 leader changes over a 15m
            # window, sustained for 15m.
            # increase() is used instead of rate(): rate() returns a per-second
            # average, so "> 3" would require more than three changes every second,
            # and {{ $value }} would not be the count the message reports.
    24      annotations:
    25        message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }}
    26          has seen {{ $value }} leader changes within the last 15 minutes.'
    27      expr: |
    28        increase(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
    29      for: 15m
    30      labels:
    31        severity: warning
    32    - alert: etcdHighNumberOfFailedGRPCRequests
            # Warning tier: for a (job, instance, grpc_service, grpc_method) group,
            # more than 1% of handled requests had grpc_code != "OK" for 10m.
    33      expr: |
    34        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
    35          /
    36        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
    37          > 1
    38      for: 10m
    39      labels:
    40        severity: warning
    41      annotations:
    42        message: >-
    43          etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.
    44    - alert: etcdHighNumberOfFailedGRPCRequests
            # Critical tier of the rule with the same name: same ratio, but the
            # threshold is 5% and it only has to hold for 5m.
    45      expr: |
    46        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
    47          /
    48        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
    49          > 5
    50      for: 5m
    51      labels:
    52        severity: critical
    53      annotations:
    54        message: >-
    55          etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.
    56    - alert: etcdGRPCRequestsSlow
            # Critical: 99th percentile handling time of unary gRPC requests, per
            # (job, instance, grpc_service, grpc_method), has exceeded 0.15s for 10m.
    57      expr: |
    58        histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
    59        > 0.15
    60      for: 10m
    61      labels:
    62        severity: critical
    63      annotations:
    64        message: >-
    65          etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.
    66    - alert: etcdMemberCommunicationSlow
            # Warning: 99th percentile of etcd_network_peer_round_trip_time_seconds
            # has exceeded 0.15s for 10m. The message reads the peer from the "To" label.
    67      expr: |
    68        histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
    69        > 0.15
    70      for: 10m
    71      labels:
    72        severity: warning
    73      annotations:
    74        message: >-
    75          etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.
    76    - alert: etcdHighNumberOfFailedProposals
            # Warning: an instance has recorded more than 5 failed proposals over a
            # 15m window, sustained for 15m.
            # increase() gives the count over the window; the previous rate() form
            # compared a per-second average against 5 while the message reported
            # {{ $value }} as a number of failures.
    77      annotations:
    78        message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
    79          the last 15 minutes on etcd instance {{ $labels.instance }}.'
    80      expr: |
    81        increase(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
    82      for: 15m
    83      labels:
    84        severity: warning
    85    - alert: etcdHighFsyncDurations
            # Warning: 99th percentile of etcd_disk_wal_fsync_duration_seconds has
            # exceeded 0.5s for 10m.
    86      annotations:
    87        message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are
    88          {{ $value }}s on etcd instance {{ $labels.instance }}.'
    89      expr: |
    90        histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
    91        > 0.5
    92      for: 10m
    93      labels:
    94        severity: warning
    95    - alert: etcdHighCommitDurations
            # Warning: 99th percentile of etcd_disk_backend_commit_duration_seconds
            # has exceeded 0.25s for 10m.
    96      annotations:
    97        message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are
    98          {{ $value }}s on etcd instance {{ $labels.instance }}.'
    99      expr: |
   100        histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
   101        > 0.25
   102      for: 10m
   103      labels:
   104        severity: warning
   105    - alert: etcdHighNumberOfFailedHTTPRequests
            # Warning tier: more than 1% of HTTP requests for a method failed
            # (404 responses excluded) on an instance for 10m.
            # The ratio is scaled by 100 because the message prints "{{ $value }}%",
            # and the aggregation keeps job/instance so that {{ $labels.instance }}
            # in the message is populated.
   106      annotations:
   107        message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
   108          instance {{ $labels.instance }}'
   109      expr: |
   110        100 * sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (job, instance, method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
   111        BY (job, instance, method) > 1
   112      for: 10m
   113      labels:
   114        severity: warning
   115    - alert: etcdHighNumberOfFailedHTTPRequests
            # Critical tier: more than 5% of HTTP requests for a method failed
            # (404 responses excluded) on an instance for 10m.
            # The ratio is scaled by 100 because the message prints "{{ $value }}%",
            # and the aggregation keeps job/instance so that {{ $labels.instance }}
            # in the message is populated.
   116      annotations:
   117        message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
   118          instance {{ $labels.instance }}.'
   119      expr: |
   120        100 * sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (job, instance, method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
   121        BY (job, instance, method) > 5
   122      for: 10m
   123      labels:
   124        severity: critical
   125    - alert: etcdHTTPRequestsSlow
            # Warning: 99th percentile of etcd_http_successful_duration_seconds has
            # exceeded 0.15s for 10m. Unlike the other rules, no job selector is applied.
   126      expr: |
   127        histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
   128        > 0.15
   129      for: 10m
   130      labels:
   131        severity: warning
   132      annotations:
   133        message: >-
   134          etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.