---
# Origin: go.etcd.io/etcd@v3.3.27+incompatible/Documentation/op-guide/etcd3_alert.rules.yml
# these rules synced manually from https://github.com/etcd-io/etcd/blob/master/Documentation/etcd-mixin/mixin.libsonnet
#
# Prometheus alerting rules for etcd. Every rule selects targets whose `job`
# label matches the regex ".*etcd.*". Rules that appear twice under the same
# alert name differ by threshold and severity (warning vs. critical).
groups:
  - name: etcd
    rules:
      # Fires when the number of targets reporting up == 1 drops below
      # (member count + 1) / 2 for the job.
      - alert: etcdInsufficientMembers
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": insufficient members
            ({{ $value }}).
        expr: |
          sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
        for: 3m
        labels:
          severity: critical

      # Fires for each member whose etcd_server_has_leader gauge is 0.
      - alert: etcdNoLeader
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
            has no leader.
        expr: |
          etcd_server_has_leader{job=~".*etcd.*"} == 0
        for: 1m
        labels:
          severity: critical

      # Uses increase() rather than rate(): rate() yields a per-second
      # average, so comparing it with a count threshold of 3 would almost
      # never fire. increase() yields the change in the counter across the
      # 15m window, which is also what the message reports as {{ $value }}.
      - alert: etcdHighNumberOfLeaderChanges
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }}
            has seen {{ $value }} leader changes within the last 15 minutes.
        expr: |
          increase(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
        for: 15m
        labels:
          severity: warning

      # Percentage of gRPC requests finishing with a code other than "OK",
      # per job/instance/service/method. Warning level: above 1%.
      - alert: etcdHighNumberOfFailedGRPCRequests
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
            {{ $labels.grpc_method }} failed on etcd instance
            {{ $labels.instance }}.
        expr: |
          100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
            /
          sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
            > 1
        for: 10m
        labels:
          severity: warning

      # Same ratio as the warning rule above. Critical level: above 5%.
      - alert: etcdHighNumberOfFailedGRPCRequests
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
            {{ $labels.grpc_method }} failed on etcd instance
            {{ $labels.instance }}.
        expr: |
          100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
            /
          sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
            > 5
        for: 5m
        labels:
          severity: critical

      # 99th percentile handling time of unary gRPC requests above 0.15s.
      - alert: etcdGRPCRequestsSlow
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": gRPC requests to
            {{ $labels.grpc_method }} are taking {{ $value }}s on etcd
            instance {{ $labels.instance }}.
        expr: |
          histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
          > 0.15
        for: 10m
        labels:
          severity: critical

      # 99th percentile peer round-trip time above 0.15s. The `To` label
      # used in the message is spelled with a capital T.
      - alert: etcdMemberCommunicationSlow
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": member communication with
            {{ $labels.To }} is taking {{ $value }}s on etcd instance
            {{ $labels.instance }}.
        expr: |
          histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning

      # Uses increase() rather than rate() so that the threshold and the
      # {{ $value }} in the message are a count of failures over the 15m
      # window instead of a per-second average.
      - alert: etcdHighNumberOfFailedProposals
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
            within the last 15 minutes on etcd instance
            {{ $labels.instance }}.
        expr: |
          increase(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
        for: 15m
        labels:
          severity: warning

      # 99th percentile WAL fsync duration above 0.5s.
      - alert: etcdHighFsyncDurations
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
            are {{ $value }}s on etcd instance {{ $labels.instance }}.
        expr: |
          histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
          > 0.5
        for: 10m
        labels:
          severity: warning

      # 99th percentile backend commit duration above 0.25s.
      - alert: etcdHighCommitDurations
        annotations:
          message: >-
            etcd cluster "{{ $labels.job }}": 99th percentile commit
            durations are {{ $value }}s on etcd instance
            {{ $labels.instance }}.
        expr: |
          histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
          > 0.25
        for: 10m
        labels:
          severity: warning

      # Percentage of HTTP requests that failed (404 responses excluded),
      # per job/instance/method. Multiplied by 100 so {{ $value }} is a real
      # percentage, and grouped by instance so {{ $labels.instance }} is
      # populated in the message. Warning level: above 1%.
      - alert: etcdHighNumberOfFailedHTTPRequests
        annotations:
          message: >-
            {{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}
        expr: |
          100 * sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (job, instance, method)
            /
          sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (job, instance, method)
            > 1
        for: 10m
        labels:
          severity: warning

      # Same ratio as the warning rule above. Critical level: above 5%.
      - alert: etcdHighNumberOfFailedHTTPRequests
        annotations:
          message: >-
            {{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}.
        expr: |
          100 * sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (job, instance, method)
            /
          sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (job, instance, method)
            > 5
        for: 10m
        labels:
          severity: critical

      # 99th percentile duration of successful HTTP requests above 0.15s.
      # Carries the same job selector as every other rule in this group.
      - alert: etcdHTTPRequestsSlow
        annotations:
          message: >-
            etcd instance {{ $labels.instance }} HTTP requests to
            {{ $labels.method }} are slow.
        expr: |
          histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning