github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/metrics/alertmanager/dm_worker.rules.yml (about)

     1  groups:
        # Alerting rules for DM (master, relay, dump/load/sync units) in one group.
        # Each rule sets `env`, `level` and a copy of its expression as labels, plus
        # `description`, `value` and `summary` annotations.
        # NOTE(review): ENV_LABELS_ENV looks like a placeholder replaced at deploy
        # time — confirm before editing it.
     2  - name: alert.rules
     3    rules:
     4    - alert: DM_master_all_down
            # Fires for every `up` series of job "dm_master" whose value is 0.
            # NOTE(review): the expression matches each down instance individually,
            # not only the case where all masters are down, despite the alert name.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
     5      expr: up{job="dm_master"} == 0
     6      labels:
     7        env: ENV_LABELS_ENV
     8        level: critical
     9        expr: up{job="dm_master"} == 0
    10      annotations:
    11        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    12        value: '{{ $value }}'
    13        summary: DM master all down, metrics not reliable
    14  
    15    - alert: DM_remain_storage_of_relay_log
            # Fires as soon as dm_relay_space{type="available"} is below
            # 10*1024*1024*1024 (no `for:` hold period).
            # NOTE(review): that is 10 GiB if the metric is reported in bytes — confirm.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
    16      expr: dm_relay_space{type="available"} < 10*1024*1024*1024
    17      labels:
    18        env: ENV_LABELS_ENV
    19        level: critical
    20        expr: dm_relay_space{type="available"} < 10*1024*1024*1024
    21      annotations:
    22        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    23        value: '{{ $value }}'
    24        summary: DM remain storage of relay log
    25  
    26    - alert: DM_relay_process_exits_with_error
            # Two branches joined with `or on(instance, job)`:
            #   1. the resumable_err="false" counter changed within the last 1m;
            #   2. the resumable_err="true" counter increased by more than 3 over 2m,
            #      for (instance, job) pairs not already returned by branch 1.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
    27      expr: changes(dm_relay_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(instance, job) increase(dm_relay_exit_with_error_count{resumable_err="true"}[2m]) > 3
    28      labels:
    29        env: ENV_LABELS_ENV
    30        level: critical
    31        expr: changes(dm_relay_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(instance, job) increase(dm_relay_exit_with_error_count{resumable_err="true"}[2m]) > 3
    32      annotations:
    33        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    34        value: '{{ $value }}'
    35        summary: DM relay process exits with error
    36  
    37    - alert: DM_relay_log_data_corruption
            # Fires when dm_relay_data_corruption changed value within the last 1m.
            # This is the only rule in view that uses level "emergency".
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
    38      expr: changes(dm_relay_data_corruption[1m]) > 0
    39      labels:
    40        env: ENV_LABELS_ENV
    41        level: emergency
    42        expr: changes(dm_relay_data_corruption[1m]) > 0
    43      annotations:
    44        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    45        value: '{{ $value }}'
    46        summary: DM relay log data corruption
    47  
    48    - alert: DM_fail_to_read_binlog_from_master
            # Fires when dm_relay_read_error_count changed value within the last 1m.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
    49      expr: changes(dm_relay_read_error_count[1m]) > 0
    50      labels:
    51        env: ENV_LABELS_ENV
    52        level: critical
    53        expr: changes(dm_relay_read_error_count[1m]) > 0
    54      annotations:
    55        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    56        value: '{{ $value }}'
    57        summary: DM fail to read binlog from master
    58  
    59    - alert: DM_fail_to_write_relay_log
            # Fires when dm_relay_write_error_count changed value within the last 1m.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
    60      expr: changes(dm_relay_write_error_count[1m]) > 0
    61      labels:
    62        env: ENV_LABELS_ENV
    63        level: critical
    64        expr: changes(dm_relay_write_error_count[1m]) > 0
    65      annotations:
    66        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    67        value: '{{ $value }}'
    68        summary: DM fail to write relay log
    69  
    70    - alert: DM_dump_process_exists_with_error
            # Two branches joined with `or on(source_id, task)`:
            #   1. the resumable_err="false" dump exit-error counter changed within 1m;
            #   2. the resumable_err="true" counter increased by more than 3 over 2m,
            #      for (source_id, task) pairs not already returned by branch 1.
            # The alert name keeps its existing spelling ("exists") so anything keyed
            # on the name keeps matching; only the human-readable summary is corrected.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
    71      expr: changes(dm_mydumper_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_mydumper_exit_with_error_count{resumable_err="true"}[2m]) > 3
    72      labels:
    73        env: ENV_LABELS_ENV
    74        level: critical
    75        expr: changes(dm_mydumper_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_mydumper_exit_with_error_count{resumable_err="true"}[2m]) > 3
    76      annotations:
    77        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
    78        value: '{{ $value }}'
    79        summary: DM dump process exits with error
    80  
    81    - alert: DM_load_process_exists_with_error
            # Two branches joined with `or on(source_id, task)`:
            #   1. the resumable_err="false" loader exit-error counter changed within 1m;
            #   2. the resumable_err="true" counter increased by more than 3 over 2m,
            #      for (source_id, task) pairs not already returned by branch 1.
            # The alert name keeps its existing spelling ("exists") so anything keyed
            # on the name keeps matching; only the human-readable summary is corrected.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
    82      expr: changes(dm_loader_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_loader_exit_with_error_count{resumable_err="true"}[2m]) > 3
    83      labels:
    84        env: ENV_LABELS_ENV
    85        level: critical
    86        expr: changes(dm_loader_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_loader_exit_with_error_count{resumable_err="true"}[2m]) > 3
    87      annotations:
    88        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
    89        value: '{{ $value }}'
    90        summary: DM load process exits with error
    91  
    92    - alert: DM_sync_process_exists_with_error
            # Two branches joined with `or on(source_id, task)`:
            #   1. the resumable_err="false" syncer exit-error counter changed within 1m;
            #   2. the resumable_err="true" counter increased by more than 3 over 2m,
            #      for (source_id, task) pairs not already returned by branch 1.
            # The alert name keeps its existing spelling ("exists") so anything keyed
            # on the name keeps matching; only the human-readable summary is corrected.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
    93      expr: changes(dm_syncer_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_syncer_exit_with_error_count{resumable_err="true"}[2m]) > 3
    94      labels:
    95        env: ENV_LABELS_ENV
    96        level: critical
    97        expr: changes(dm_syncer_exit_with_error_count{resumable_err="false"}[1m]) > 0 or on(source_id, task) increase(dm_syncer_exit_with_error_count{resumable_err="true"}[2m]) > 3
    98      annotations:
    99        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
   100        value: '{{ $value }}'
   101        summary: DM sync process exits with error
   102  
   103    - alert: DM_task_state
            # Fires once dm_worker_task_state has been equal to 3 for 20m.
            # NOTE(review): 3 presumably encodes the "paused" stage, going by the
            # summary text — confirm against the metric definition.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
   104      expr: dm_worker_task_state == 3
   105      for: 20m
   106      labels:
   107        env: ENV_LABELS_ENV
   108        level: critical
   109        expr: dm_worker_task_state == 3
   110      annotations:
   111        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
   112        value: '{{ $value }}'
   113        summary: dm worker paused exceed 20 min
   114  
   115    - alert: DM_binlog_file_gap_between_master_relay
            # Subtracts the node="relay" value of dm_relay_binlog_file from the
            # node="master" value per (instance, job) and fires once the difference
            # has been greater than 1 for 10m.
            # One-to-one matching with ON(instance, job) leaves only those two labels
            # on the result, so the description does not reference `task` (it would
            # always render empty here).
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
   116      expr: dm_relay_binlog_file{node="master"} - ON(instance, job) dm_relay_binlog_file{node="relay"} > 1
   117      for: 10m
   118      labels:
   119        env: ENV_LABELS_ENV
   120        level: critical
   121        expr: dm_relay_binlog_file{node="master"} - ON(instance, job) dm_relay_binlog_file{node="relay"} > 1
   122      annotations:
   123        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
   124        value: '{{ $value }}'
   125        summary: dm relay binlog file not catch up master server exceed 10 min
   126  
   127    - alert: DM_binlog_file_gap_between_master_syncer
            # Subtracts the node="syncer" value of dm_syncer_binlog_file from the
            # node="master" value per (instance, task, job) and fires once the
            # difference has been greater than 1 for 10m.
            # `task` is part of the ON(...) list, so it is available to the description.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
   128      expr: dm_syncer_binlog_file{node="master"} - ON(instance, task, job) dm_syncer_binlog_file{node="syncer"} > 1
   129      for: 10m
   130      labels:
   131        env: ENV_LABELS_ENV
   132        level: critical
   133        expr: dm_syncer_binlog_file{node="master"} - ON(instance, task, job) dm_syncer_binlog_file{node="syncer"} > 1
   134      annotations:
   135        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
   136        value: '{{ $value }}'
   137        summary: dm syncer binlog file not catch up master server exceed 10 min
   138  
   139    - alert: DM_binlog_file_gap_between_relay_syncer
            # Subtracts dm_syncer_binlog_file{node="syncer"} from
            # dm_relay_binlog_file{node="relay"}, matched on (instance, job), and
            # fires once the difference has been greater than 1 for 10m.
            # `group_right` allows several syncer series to match one relay series and
            # keeps the syncer-side labels on the result (used by the description).
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
   140      expr: dm_relay_binlog_file{node="relay"} - ON(instance, job) group_right dm_syncer_binlog_file{node="syncer"} > 1
   141      for: 10m
   142      labels:
   143        env: ENV_LABELS_ENV
   144        level: critical
   145        expr: dm_relay_binlog_file{node="relay"} - ON(instance, job) group_right dm_syncer_binlog_file{node="syncer"} > 1
   146      annotations:
   147        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
   148        value: '{{ $value }}'
   149        summary: dm syncer binlog file not catch up relay exceed 10 min
   150  
   151    - alert: DM_worker_offline
            # Fires once dm_master_worker_state has been equal to 0 for 1h.
            # NOTE(review): 0 presumably encodes "offline", going by the summary text,
            # and it is unclear whether this metric carries a `task` label for the
            # description — confirm against the metric definition.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
   152      expr: dm_master_worker_state == 0
   153      for: 1h
   154      labels:
   155        env: ENV_LABELS_ENV
   156        level: critical
   157        expr: dm_master_worker_state == 0
   158      annotations:
   159        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
   160        value: '{{ $value }}'
   161        summary: dm worker offline exceed 1h
   162  
   163    - alert: DM_pending_DDL
            # Fires once dm_master_ddl_state_number has been greater than 0 for 1h.
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
   164      expr: dm_master_ddl_state_number > 0
   165      for: 1h
   166      labels:
   167        env: ENV_LABELS_ENV
   168        level: critical
   169        expr: dm_master_ddl_state_number > 0
   170      annotations:
   171        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
   172        value: '{{ $value }}'
   173        summary: DDL pending exceed 1h
   174  
   175    - alert: DM_DDL_error
            # Fires as soon as dm_master_shard_ddl_error increased within the last 1m
            # (no `for:` hold period).
            # `labels.expr` is a copy of the rule expression; keep it in sync with `expr`.
   176      expr: increase(dm_master_shard_ddl_error[1m]) > 0
   177      labels:
   178        env: ENV_LABELS_ENV
   179        level: critical
   180        expr: increase(dm_master_shard_ddl_error[1m]) > 0
   181      annotations:
   182        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
   183        value: '{{ $value }}'
   184        summary: DDL error happens