# Source: github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/metrics/alertmanager/ticdc.rules.yml
#
# Prometheus alerting rules for TiCDC and the TiKV CDC component.
#
# Conventions used by every rule in this file:
#   * `labels.expr` carries a verbatim copy of the rule's `expr`, so the
#     query text travels with the fired alert. Keep the two in sync when
#     editing a rule.
#   * `ENV_LABELS_ENV` looks like a placeholder for the cluster name --
#     presumably substituted by deployment tooling; confirm before renaming.
#   * `level` is either `warning` or `critical`.
groups:
  - name: alert.rules
    rules:
      # server related alert rules

      # Fires when the summed per-second rate of
      # ticdc_owner_ownership_counter over 240s stays >= 0.125 for 1m.
      # NOTE(review): sum() without a `by` clause drops the `instance`
      # label, so `{{ $labels.instance }}` renders empty in the two
      # owner rules below.
      - alert: cdc_multiple_owners
        expr: sum(rate(ticdc_owner_ownership_counter[240s])) >= 0.125
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: sum(rate(ticdc_owner_ownership_counter[240s])) >= 0.125
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc cluster has multiple owners

      # Fires when the same summed rate stays below 0.0625 for 10m.
      - alert: cdc_no_owner
        expr: sum(rate(ticdc_owner_ownership_counter[240s])) < 0.0625
        for: 10m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: sum(rate(ticdc_owner_ownership_counter[240s])) < 0.0625
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc cluster has no owner for more than 10 minutes

      # changefeed related alert rules

      # ticdc_owner_status peaking at exactly 2 within the last minute is
      # reported as a failed changefeed (see summary).
      - alert: ticdc_changefeed_failed
        expr: (max_over_time(ticdc_owner_status[1m]) == 2) > 0
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: (max_over_time(ticdc_owner_status[1m]) == 2) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc changefeed failed, it can not be automatically resumed

      # Threshold 600 corresponds to the 10 minutes named in the summary.
      - alert: cdc_checkpoint_high_delay
        expr: ticdc_owner_checkpoint_ts_lag > 600
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: ticdc_owner_checkpoint_ts_lag > 600
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc owner checkpoint delay more than 10 minutes

      # Threshold 300 corresponds to the 5 minutes named in the summary.
      - alert: cdc_resolvedts_high_delay
        expr: ticdc_owner_resolved_ts_lag > 300
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: ticdc_owner_resolved_ts_lag > 300
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc owner resolved ts delay more than 5 minutes

      # Fires when ticdc_sink_execution_error changed value at least once
      # within the last minute.
      - alert: ticdc_sink_execution_error
        expr: changes(ticdc_sink_execution_error[1m]) > 0
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: changes(ticdc_sink_execution_error[1m]) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc sink execution meets errors

      # Fires when ticdc_processor_exit_with_error_count changed value at
      # least once within the last minute.
      - alert: ticdc_processor_exit_with_error_count
        expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc processor exits with error

      # Same shape as ticdc_changefeed_failed, but matches status value 1
      # and is only a warning.
      - alert: ticdc_changefeed_meet_error
        expr: (max_over_time(ticdc_owner_status[1m]) == 1) > 0
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: (max_over_time(ticdc_owner_status[1m]) == 1) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc changefeed meet error

      # tikv related alert rules

      # Fires when tikv_cdc_min_resolved_ts did not change in the last
      # minute, restricted (matching on `instance`) to instances that
      # report resolved regions and a non-zero captured region total.
      # The folded scalars below yield the query as a single line;
      # labels.expr mirrors the full expr, including the
      # captured-region filter.
      - alert: tikv_cdc_min_resolved_ts_no_change_for_1m
        expr: >-
          changes(tikv_cdc_min_resolved_ts[1m]) < 1
          and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
          and ON (instance) tikv_cdc_captured_region_total > 0
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: >-
            changes(tikv_cdc_min_resolved_ts[1m]) < 1
            and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
            and ON (instance) tikv_cdc_captured_region_total > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          # Reports the sample value, like every other rule in this
          # group; the instance is already part of `description`.
          value: '{{ $value }}'
          summary: tikv cdc min resolved ts no change for 1m

      # 0.9 quantile of the scan duration histogram, computed from 1m
      # bucket rates; 600 corresponds to the 10 min named in the summary.
      - alert: tikv_cdc_scan_duration_seconds_more_than_10min
        expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: tikv cdc scan duration seconds more than 10 min

      # ticdc resource related alert rules

      # 1e+10 bytes is the "10 GB" named in the summary; the selector
      # limits the check to targets scraped under job="ticdc".
      - alert: ticdc_memory_abnormal
        expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: TiCDC heap memory usage is over 10 GB