# Source: github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/metrics/alertmanager/ticdc.rules.yml
#
# Prometheus alerting rules for TiCDC and the TiKV CDC component.
#
# Conventions used by every rule in this file:
#   * labels.env   - the literal ENV_LABELS_ENV (presumably a placeholder that
#                    is substituted at deploy time - TODO confirm).
#   * labels.level - severity, either "warning" or "critical".
#   * labels.expr  - a verbatim copy of the rule's `expr`; keep both in sync
#                    whenever a query or threshold is edited.
#   * annotations  - description / value / summary; the Go-template values are
#                    single-quoted so YAML does not parse `{{ ... }}` as a
#                    flow mapping.
---
groups:
  - name: alert.rules
    rules:
      # Summed 30s rate of the ownership counter is >= 2 for one minute.
      - alert: cdc_multiple_owners
        expr: sum(rate(ticdc_owner_ownership_counter[30s])) >= 2
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: sum(rate(ticdc_owner_ownership_counter[30s])) >= 2
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc cluster has multiple owners

      # checkpoint_ts / 1000 is subtracted from time() (seconds), so the metric
      # is presumably in milliseconds. 600 s = the 10 minutes in the summary.
      - alert: cdc_checkpoint_high_delay
        expr: (time() - ticdc_processor_checkpoint_ts / 1000) > 600
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: (time() - ticdc_processor_checkpoint_ts / 1000) > 600
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc processor checkpoint delay more than 10 minutes

      # Same shape as the checkpoint rule, with a 300 s (5 minute) threshold.
      - alert: cdc_resolvedts_high_delay
        expr: (time() - ticdc_processor_resolved_ts / 1000) > 300
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: (time() - ticdc_processor_resolved_ts / 1000) > 300
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc processor resolved ts delay more than 5 minutes

      # 0.9 quantile of the sorter "sort" histogram above 2.
      # NOTE(review): ticdc_puller_entry_sorter_sort_bucket below watches the
      # same histogram with a threshold of 1; both rules are kept as found.
      - alert: ticdc_puller_entry_sorter_sort_duration_time_more_than_2s
        expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket[1m])) > 2
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket[1m])) > 2
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: ticdc_puller_entry_sorter sort duration time more than 2s

      # 0.9 quantile of the sorter "merge" histogram above 2.
      # NOTE(review): ticdc_puller_entry_sorter_merge_bucket below watches the
      # same histogram with a threshold of 1; both rules are kept as found.
      - alert: ticdc_puller_entry_sorter_merge_duration_time_more_than_2s
        expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket[1m])) > 2
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket[1m])) > 2
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: ticdc_puller_entry_sorter merge duration time more than 2s

      # The quantile is scaled by 1000 and compared against 1000, which matches
      # the 1s threshold stated in the alert name and summary.
      - alert: ticdc_mounter_unmarshal_and_mount_time_more_than_1s
        expr: histogram_quantile(0.9, rate(ticdc_mounter_unmarshal_and_mount_bucket[1m])) * 1000 > 1000
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: histogram_quantile(0.9, rate(ticdc_mounter_unmarshal_and_mount_bucket[1m])) * 1000 > 1000
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc_mounter unmarshal and mount time more than 1s

      # 0.9 quantile of the sink transaction execution histogram above 10.
      - alert: cdc_sink_execute_duration_time_more_than_10s
        expr: histogram_quantile(0.9, rate(ticdc_sink_txn_exec_duration_bucket[1m])) > 10
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: histogram_quantile(0.9, rate(ticdc_sink_txn_exec_duration_bucket[1m])) > 10
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc sink execute_duration_time_more_than_10s

      # checkpoint_ts did not change within the 1m range window.
      # NOTE(review): this is the only rule without a `for:` clause, so it has
      # no pending period. Left unchanged because adding one would alter when
      # the alert fires - confirm whether the omission is intentional.
      - alert: cdc_processor_checkpoint_tso_no_change_for_1m
        expr: changes(ticdc_processor_checkpoint_ts[1m]) < 1
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: changes(ticdc_processor_checkpoint_ts[1m]) < 1
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc processor checkpoint tso no change for 1m

      # 0.9 quantile of the sorter "sort" histogram above 1.
      # labels.expr carries the `> 1` threshold so it mirrors `expr`, as in
      # every other rule of this group.
      - alert: ticdc_puller_entry_sorter_sort_bucket
        expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket{}[1m])) > 1
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket{}[1m])) > 1
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: ticdc puller entry sorter sort latency is too high

      # 0.9 quantile of the sorter "merge" histogram above 1.
      # labels.expr carries the `> 1` threshold so it mirrors `expr`, as in
      # every other rule of this group.
      - alert: ticdc_puller_entry_sorter_merge_bucket
        expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket{}[1m])) > 1
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket{}[1m])) > 1
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: ticdc puller entry sorter merge latency is too high

      # tikv_cdc_min_resolved_ts did not change within the 1m range window,
      # and that condition held for a further minute.
      - alert: tikv_cdc_min_resolved_ts_no_change_for_1m
        expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: tikv cdc min resolved ts no change for 1m

      # 0.9 quantile of the TiKV CDC scan duration histogram above 30.
      - alert: tikv_cdc_scan_duration_seconds_more_than_30s
        expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 30
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 30
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: tikv cdc scan duration seconds more than 30s

      # The MySQL sink execution error metric changed within the last minute.
      - alert: ticdc_sink_mysql_execution_error
        expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc sink mysql execution meets errors

      # The processor exit-with-error metric changed within the last minute.
      - alert: ticdc_processor_exit_with_error_count
        expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc processor exits with error

      # Go heap allocation of the "ticdc" job above 1e+10 bytes (the 10 GB in
      # the summary).
      - alert: ticdc_memory_abnormal
        expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: TiCDC heap memory usage is over 10 GB

      # Sum of the raftstore config series named "hibernate_regions" is above 0.
      - alert: tikv_enabled_hibernate_regions
        expr: sum(tikv_config_raftstore{name="hibernate_regions"}) > 0
        for: 1m
        labels:
          env: ENV_LABELS_ENV
          level: warning
          expr: sum(tikv_config_raftstore{name="hibernate_regions"}) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: cdc will break tikv hibernate regions