# Prometheus alerting rules for DM (master, relay, dump/load/sync units).
# Origin: github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/metrics/alertmanager/dm_worker.rules.yml
#
# Conventions used by every rule below:
#   * `labels.expr` repeats the rule's own `expr` verbatim; keep the two
#     copies in sync when editing a rule.
#   * `ENV_LABELS_ENV` looks like a placeholder substituted at deploy time
#     (NOTE(review): confirm against the deployment tooling).
#   * Alert names are identifiers that routing/silences may depend on, so
#     they are left untouched even where they contain a spelling slip
#     (e.g. "exists" for "exits"); only human-readable summaries are fixed.
---
groups:
  - name: alert.rules
    rules:
      # NOTE(review): this expression yields one alert per dm_master target
      # whose `up` is 0, i.e. it fires when ANY master is down, not only when
      # all of them are. Confirm that matches the "all down" intent.
      - alert: DM_master_all_down
        expr: up{job="dm_master"} == 0
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: up{job="dm_master"} == 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM master all down, metrics not reliable

      # Threshold is 10*1024^3 (10 GiB if the metric is reported in bytes --
      # NOTE(review): confirm the unit of dm_relay_space).
      - alert: DM_remain_storage_of_relay_log
        expr: dm_relay_space{type="available"} < 10*1024*1024*1024
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: dm_relay_space{type="available"} < 10*1024*1024*1024
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM remain storage of relay log

      # Fires on any change of the non-resumable error counter within 1m, or
      # when the resumable error counter increases by more than 3 within 2m.
      - alert: DM_relay_process_exits_with_error
        expr: changes(dm_relay_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(instance, job) increase(dm_relay_exit_with_error_count{resumable_err="true"}[2m]) > 3
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: changes(dm_relay_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(instance, job) increase(dm_relay_exit_with_error_count{resumable_err="true"}[2m]) > 3
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM relay process exits with error

      # The only rule in this group raised at level `emergency`.
      - alert: DM_relay_log_data_corruption
        expr: changes(dm_relay_data_corruption[1m]) > 0
        labels:
          env: ENV_LABELS_ENV
          level: emergency
          expr: changes(dm_relay_data_corruption[1m]) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM relay log data corruption

      - alert: DM_fail_to_read_binlog_from_master
        expr: changes(dm_relay_read_error_count[1m]) > 0
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: changes(dm_relay_read_error_count[1m]) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM fail to read binlog from master

      - alert: DM_fail_to_write_relay_log
        expr: changes(dm_relay_write_error_count[1m]) > 0
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: changes(dm_relay_write_error_count[1m]) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM fail to write relay log

      # Dump / load / sync units share one pattern: any change of the
      # non-resumable error counter within 1m, or more than 3 resumable
      # errors within 2m, de-duplicated per (source_id, task).
      - alert: DM_dump_process_exists_with_error
        expr: changes(dm_mydumper_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_mydumper_exit_with_error_count{resumable_err="true"}[2m]) > 3
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: changes(dm_mydumper_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_mydumper_exit_with_error_count{resumable_err="true"}[2m]) > 3
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM dump process exits with error

      - alert: DM_load_process_exists_with_error
        expr: changes(dm_loader_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_loader_exit_with_error_count{resumable_err="true"}[2m]) > 3
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: changes(dm_loader_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_loader_exit_with_error_count{resumable_err="true"}[2m]) > 3
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM load process exits with error

      - alert: DM_sync_process_exists_with_error
        expr: changes(dm_syncer_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_syncer_exit_with_error_count{resumable_err="true"}[2m]) > 3
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: changes(dm_syncer_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_syncer_exit_with_error_count{resumable_err="true"}[2m]) > 3
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DM sync process exits with error

      # State value 3 is what the summary calls "paused"; the condition must
      # hold for 20m before the alert fires.
      - alert: DM_task_state
        expr: dm_worker_task_state == 3
        for: 20m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: dm_worker_task_state == 3
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: dm worker paused exceed 20 min

      # One-to-one matching with ON(instance, job) leaves only `instance` and
      # `job` on the result, so no `task` label is available to the template;
      # the description therefore reports the instance only.
      - alert: DM_binlog_file_gap_between_master_relay
        expr: dm_relay_binlog_file{node="master"} - ON(instance, job) dm_relay_binlog_file{node="relay"} > 1
        for: 10m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: dm_relay_binlog_file{node="master"} - ON(instance, job) dm_relay_binlog_file{node="relay"} > 1
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: dm relay binlog file not catch up master server exceed 10 min

      - alert: DM_binlog_file_gap_between_master_syncer
        expr: dm_syncer_binlog_file{node="master"} - ON(instance, task, job) dm_syncer_binlog_file{node="syncer"} > 1
        for: 10m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: dm_syncer_binlog_file{node="master"} - ON(instance, task, job) dm_syncer_binlog_file{node="syncer"} > 1
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: dm syncer binlog file not catch up master server exceed 10 min

      # group_right keeps the labels of the right-hand (syncer) series, which
      # is where the description's `task` label is expected to come from.
      - alert: DM_binlog_file_gap_between_relay_syncer
        expr: dm_relay_binlog_file{node="relay"} - ON(instance, job) group_right dm_syncer_binlog_file{node="syncer"} > 1
        for: 10m
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: dm_relay_binlog_file{node="relay"} - ON(instance, job) group_right dm_syncer_binlog_file{node="syncer"} > 1
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: dm syncer binlog file not catch up relay exceed 10 min

      # NOTE(review): the descriptions of the three dm_master_* rules below
      # reference `task`; whether those metrics carry a `task` label is not
      # visible here -- confirm, otherwise the field renders empty.
      - alert: DM_worker_offline
        expr: dm_master_worker_state == 0
        for: 1h
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: dm_master_worker_state == 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: dm worker offline exceed 1h

      - alert: DM_pending_DDL
        expr: dm_master_ddl_state_number > 0
        for: 1h
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: dm_master_ddl_state_number > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DDL pending exceed 1h

      - alert: DM_DDL_error
        expr: increase(dm_master_shard_ddl_error[1m]) > 0
        labels:
          env: ENV_LABELS_ENV
          level: critical
          expr: increase(dm_master_shard_ddl_error[1m]) > 0
        annotations:
          description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: DDL error happens