# Source: go.etcd.io/etcd@v3.3.27+incompatible/Documentation/op-guide/etcd3_alert.rules
#
# Prometheus 1.x alerting rules for an etcd v3 cluster. Every rule that
# filters on a job expects the etcd scrape job to be named "etcd".

# general cluster availability
# ============================

# alert if another failed member will result in an unavailable cluster
#
# With n members of which d are down, quorum is floor(n/2) + 1. One further
# failure breaks quorum exactly when d > ceil(n/2) - 2. Using ceil() keeps the
# threshold correct for even-sized clusters as well as odd-sized ones.
# When no member is down the left-hand side is empty and the alert stays silent.
ALERT InsufficientMembers
IF count(up{job="etcd"} == 0) > (ceil(count(up{job="etcd"}) / 2) - 2)
FOR 3m
LABELS {
  severity = "critical"
}
ANNOTATIONS {
  summary = "etcd cluster insufficient members",
  description = "If one more etcd member goes down the cluster will be unavailable",
}

# etcd leader alerts
# ==================

# alert if any etcd instance has had no leader for one minute
ALERT NoLeader
IF etcd_server_has_leader{job="etcd"} == 0
FOR 1m
LABELS {
  severity = "critical"
}
ANNOTATIONS {
  summary = "etcd member has no leader",
  description = "etcd member {{ $labels.instance }} has no leader",
}

# alert if an instance has seen more than 3 leader changes within the last hour
# (no FOR clause: fires as soon as the threshold is crossed)
ALERT HighNumberOfLeaderChanges
IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
LABELS {
  severity = "warning"
}
ANNOTATIONS {
  summary = "a high number of leader changes within the etcd cluster are happening",
  description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
}

# gRPC request alerts
# ===================

# NOTE(review): the etcd_grpc_* series used by the two rules below may not be
# exported by every etcd release (the latency rule further down already relies
# on the grpc_server_* family) - confirm against the monitored version.

# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
# `instance` is part of the grouping so that {{ $labels.instance }} in the
# description resolves to a value.
ALERT HighNumberOfFailedGRPCRequests
IF 100 * (sum by(instance, grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
  / sum by(instance, grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 1
FOR 10m
LABELS {
  severity = "warning"
}
ANNOTATIONS {
  summary = "a high number of gRPC requests are failing",
  description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
}

# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
# Same expression as the warning above with a higher threshold, a shorter FOR
# and critical severity.
ALERT HighNumberOfFailedGRPCRequests
IF 100 * (sum by(instance, grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
  / sum by(instance, grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 5
FOR 5m
LABELS {
  severity = "critical"
}
ANNOTATIONS {
  summary = "a high number of gRPC requests are failing",
  description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
}

# alert if the 99th percentile of unary gRPC method calls takes more than 150ms
# `le` must survive the aggregation for histogram_quantile(); `instance` is
# kept so that the description can name the affected member.
ALERT GRPCRequestsSlow
IF histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (instance, grpc_service, grpc_method, le)) > 0.15
FOR 10m
LABELS {
  severity = "critical"
}
ANNOTATIONS {
  summary = "slow gRPC requests",
  description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
}

# file descriptor alerts
# ======================

# recording rule: fraction of the file descriptor limit currently in use
# (no job selector, so it is recorded for every scraped process)
instance:fd_utilization = process_open_fds / process_max_fds

# alert if file descriptors are likely to exhaust within the next 4 hours
# (linear extrapolation of the last hour of utilization)
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
FOR 10m
LABELS {
  severity = "warning"
}
ANNOTATIONS {
  summary = "file descriptors soon exhausted",
  description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
}

# alert if file descriptors are likely to exhaust within the next hour
# (linear extrapolation of the last 10 minutes of utilization)
ALERT FdExhaustionClose
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
FOR 10m
LABELS {
  severity = "critical"
}
ANNOTATIONS {
  summary = "file descriptors soon exhausted",
  description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
}

# etcd member communication alerts
# ================================

# alert if the 99th percentile of peer round-trip times exceeds 150ms
# The histogram is not aggregated, so the series' own labels (including
# `instance` and `To`) are available to the description template.
ALERT EtcdMemberCommunicationSlow
IF histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
FOR 10m
LABELS {
  severity = "warning"
}
ANNOTATIONS {
  summary = "etcd member communication is slow",
  description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
}

# etcd proposal alerts
# ====================

# alert if an instance has seen more than 5 failed proposals within the last hour
# (no FOR clause: fires as soon as the threshold is crossed)
ALERT HighNumberOfFailedProposals
IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
LABELS {
  severity = "warning"
}
ANNOTATIONS {
  summary = "a high number of proposals within the etcd cluster are failing",
  description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
}

# etcd disk io latency alerts
# ===========================

# alert if the 99th percentile of WAL fsync durations is higher than 500ms
ALERT HighFsyncDurations
IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
FOR 10m
LABELS {
  severity = "warning"
}
ANNOTATIONS {
  summary = "high fsync durations",
  description = "etcd instance {{ $labels.instance }} fsync durations are high",
}

# alert if the 99th percentile of backend commit durations is higher than 250ms
ALERT HighCommitDurations
IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
FOR 10m
LABELS {
  severity = "warning"
}
ANNOTATIONS {
  summary = "high commit durations",
  description = "etcd instance {{ $labels.instance }} commit durations are high",
}