go.etcd.io/etcd@v3.3.27+incompatible/Documentation/op-guide/etcd3_alert.rules (about)

     1  # general cluster availability
     2  
     3  # alert if losing one more member would break quorum; ">=" keeps this correct for even-sized clusters (e.g. 1 of 4 down)
     4  ALERT InsufficientMembers
     5  IF count(up{job="etcd"} == 0) >= (count(up{job="etcd"}) / 2 - 1)
     6  FOR 3m
     7  LABELS {
     8    severity = "critical"
     9  }
    10  ANNOTATIONS {
    11    summary = "etcd cluster insufficient members",
    12    description = "If one more etcd member goes down the cluster will be unavailable",
    13  }
    14  
    15  # etcd leader alerts
    16  # ==================
    17  
    18  # alert (critical) when a member reports etcd_server_has_leader == 0 continuously for 1m
    19  ALERT NoLeader
    20  IF etcd_server_has_leader{job="etcd"} == 0
    21  FOR 1m
    22  LABELS {
    23    severity = "critical"
    24  }
    25  ANNOTATIONS {
    26    summary = "etcd member has no leader",
    27    description = "etcd member {{ $labels.instance }} has no leader",
    28  }
    29  
    30  # alert (warning) when a member's leader-change counter grows by more than 3 within 1h (this rule has no FOR clause)
    31  ALERT HighNumberOfLeaderChanges
    32  IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
    33  LABELS {
    34    severity = "warning"
    35  }
    36  ANNOTATIONS {
    37    summary = "a high number of leader changes within the etcd cluster are happening",
    38    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
    39  }
    40  
    41  # gRPC request alerts
    42  # ===================
    43  
    44  # alert (warning) if more than 1% of calls to a gRPC method on an instance returned a non-OK code over the last 5 minutes
    45  ALERT HighNumberOfFailedGRPCRequests
    46  IF 100 * (sum by(instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job="etcd",grpc_code!="OK"}[5m]))
    47    / sum by(instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job="etcd"}[5m]))) > 1
    48  FOR 10m
    49  LABELS {
    50    severity = "warning"
    51  }
    52  ANNOTATIONS {
    53    summary = "a high number of gRPC requests are failing",
    54    description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
    55  }
    56  
    57  # alert (critical) if more than 5% of calls to a gRPC method on an instance returned a non-OK code over the last 5 minutes
    58  ALERT HighNumberOfFailedGRPCRequests
    59  IF 100 * (sum by(instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job="etcd",grpc_code!="OK"}[5m]))
    60    / sum by(instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job="etcd"}[5m]))) > 5
    61  FOR 5m
    62  LABELS {
    63    severity = "critical"
    64  }
    65  ANNOTATIONS {
    66    summary = "a high number of gRPC requests are failing",
    67    description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
    68  }
    69  
    70  # alert if the 99th percentile of unary gRPC calls to a method on an instance takes more than 150ms (instance kept for the description)
    71  ALERT GRPCRequestsSlow
    72  IF histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (instance, grpc_service, grpc_method, le)) > 0.15
    73  FOR 10m
    74  LABELS {
    75    severity = "critical"
    76  }
    77  ANNOTATIONS {
    78    summary = "slow gRPC requests",
    79    description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
    80  }
    81  
    82  # file descriptor alerts
    83  # ======================
    84  # recording rule: ratio of open to maximum file descriptors per series (a value of 1 means the limit is reached)
    85  instance:fd_utilization = process_open_fds / process_max_fds
    86  
    87  # alert (warning) when a linear projection from the last 1h of fd utilization exceeds 1 four hours ahead, sustained for 10m
    88  ALERT FdExhaustionClose
    89  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
    90  FOR 10m
    91  LABELS {
    92    severity = "warning"
    93  }
    94  ANNOTATIONS {
    95    summary = "file descriptors soon exhausted",
    96    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
    97  }
    98  
    99  # alert (critical) when a linear projection from the last 10m of fd utilization exceeds 1 one hour ahead, sustained for 10m
   100  ALERT FdExhaustionClose
   101  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
   102  FOR 10m
   103  LABELS {
   104    severity = "critical"
   105  }
   106  ANNOTATIONS {
   107    summary = "file descriptors soon exhausted",
   108    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
   109  }
   110  
   111  # etcd member communication alerts
   112  # ================================
   113  
   114  # alert (warning) when the 99th percentile of peer round-trip time stays above 150ms for 10m
   115  ALERT EtcdMemberCommunicationSlow
   116  IF histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
   117  FOR 10m
   118  LABELS {
   119    severity = "warning"
   120  }
   121  ANNOTATIONS {
   122    summary = "etcd member communication is slow",
   123    description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
   124  }
   125  
   126  # etcd proposal alerts
   127  # ====================
   128  
   129  # alert (warning) when a member's failed-proposal counter grows by more than 5 within 1h (this rule has no FOR clause)
   130  ALERT HighNumberOfFailedProposals
   131  IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
   132  LABELS {
   133    severity = "warning"
   134  }
   135  ANNOTATIONS {
   136    summary = "a high number of proposals within the etcd cluster are failing",
   137    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
   138  }
   139  
   140  # etcd disk io latency alerts
   141  # ===========================
   142  
   143  # alert (warning) when the 99th percentile of WAL fsync durations stays above 500ms for 10m
   144  ALERT HighFsyncDurations
   145  IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
   146  FOR 10m
   147  LABELS {
   148    severity = "warning"
   149  }
   150  ANNOTATIONS {
   151    summary = "high fsync durations",
   152    description = "etcd instance {{ $labels.instance }} fsync durations are high",
   153  }
   154  
   155  # alert (warning) when the 99th percentile of backend commit durations stays above 250ms for 10m
   156  ALERT HighCommitDurations
   157  IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
   158  FOR 10m
   159  LABELS {
   160    severity = "warning"
   161  }
   162  ANNOTATIONS {
   163    summary = "high commit durations",
   164    description = "etcd instance {{ $labels.instance }} commit durations are high",
   165  }