github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/metrics/alertmanager/ticdc.rules.yml (about)

     1  groups:
        # Rule group header; the alert rules are listed under its `rules` key.
     2  - name: 'alert.rules'
     3    rules:
     4    - alert: cdc_multiple_owners
            # Fires when the summed 30s rate of ticdc_owner_ownership_counter stays
            # at or above 2 for one minute.
            # NOTE(review): sum() without a by-clause drops all labels, so the
            # instance placeholder in the description likely renders empty - confirm.
            # NOTE(review): ENV_LABELS_ENV looks like a placeholder substituted at
            # deploy time - confirm against the deployment tooling.
     5      expr: 'sum(rate(ticdc_owner_ownership_counter[30s])) >= 2'
     6      for: '1m'
     7      labels:
     8        env: 'ENV_LABELS_ENV'
     9        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
    10        expr: 'sum(rate(ticdc_owner_ownership_counter[30s])) >= 2'
    11      annotations:
    12        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
    13        value: "{{ $value }}"
    14        summary: 'cdc cluster has multiple owners'
    15  
    16    - alert: cdc_checkpoint_high_delay
            # Fires when time() minus ticdc_processor_checkpoint_ts / 1000 stays
            # above 600 for one minute (the summary calls this a 10 minute delay).
            # The division by 1000 presumably converts milliseconds to seconds -
            # confirm against the metric definition.
    17      expr: '(time() - ticdc_processor_checkpoint_ts / 1000) > 600'
    18      for: '1m'
    19      labels:
    20        env: 'ENV_LABELS_ENV'
    21        level: 'critical'
              # Copy of the rule expression, attached to the alert as a label.
    22        expr: '(time() - ticdc_processor_checkpoint_ts / 1000) > 600'
    23      annotations:
    24        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
    25        value: "{{ $value }}"
    26        summary: 'cdc processor checkpoint delay more than 10 minutes'
    27  
    28    - alert: cdc_resolvedts_high_delay
            # Fires when time() minus ticdc_processor_resolved_ts / 1000 stays
            # above 300 for one minute (the summary calls this a 5 minute delay).
            # The division by 1000 presumably converts milliseconds to seconds -
            # confirm against the metric definition.
    29      expr: '(time() - ticdc_processor_resolved_ts / 1000) > 300'
    30      for: '1m'
    31      labels:
    32        env: 'ENV_LABELS_ENV'
    33        level: 'critical'
              # Copy of the rule expression, attached to the alert as a label.
    34        expr: '(time() - ticdc_processor_resolved_ts / 1000) > 300'
    35      annotations:
    36        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
    37        value: "{{ $value }}"
    38        summary: 'cdc processor resolved ts delay more than 5 minutes'
    39  
    40    - alert: ticdc_puller_entry_sorter_sort_duration_time_more_than_2s
            # Fires when the 0.9 quantile computed from the 1m rate of
            # ticdc_puller_entry_sorter_sort_bucket stays above 2 for one minute
            # (2s per the alert name).
            # A separate rule in this file watches the same histogram with a
            # threshold of 1.
    41      expr: 'histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket[1m])) > 2'
    42      for: '1m'
    43      labels:
    44        env: 'ENV_LABELS_ENV'
    45        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
    46        expr: 'histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket[1m])) > 2'
    47      annotations:
    48        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
    49        value: "{{ $value }}"
    50        summary: 'ticdc_puller_entry_sorter sort duration time more than 2s'
    51  
    52    - alert: ticdc_puller_entry_sorter_merge_duration_time_more_than_2s
            # Fires when the 0.9 quantile computed from the 1m rate of
            # ticdc_puller_entry_sorter_merge_bucket stays above 2 for one minute
            # (2s per the alert name).
            # A separate rule in this file watches the same histogram with a
            # threshold of 1.
    53      expr: 'histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket[1m])) > 2'
    54      for: '1m'
    55      labels:
    56        env: 'ENV_LABELS_ENV'
    57        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
    58        expr: 'histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket[1m])) > 2'
    59      annotations:
    60        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
    61        value: "{{ $value }}"
    62        summary: 'ticdc_puller_entry_sorter merge duration time more than 2s'
    63  
    64    - alert: ticdc_mounter_unmarshal_and_mount_time_more_than_1s
            # Fires when the 0.9 quantile computed from the 1m rate of
            # ticdc_mounter_unmarshal_and_mount_bucket, multiplied by 1000, stays
            # above 1000 for one minute - i.e. the quantile itself is above 1
            # (1s per the alert name).
    65      expr: 'histogram_quantile(0.9, rate(ticdc_mounter_unmarshal_and_mount_bucket[1m])) * 1000 > 1000'
    66      for: '1m'
    67      labels:
    68        env: 'ENV_LABELS_ENV'
    69        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
    70        expr: 'histogram_quantile(0.9, rate(ticdc_mounter_unmarshal_and_mount_bucket[1m])) * 1000 > 1000'
    71      annotations:
    72        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
    73        value: "{{ $value }}"
    74        summary: 'cdc_mounter unmarshal and mount time more than 1s'
    75  
    76    - alert: cdc_sink_execute_duration_time_more_than_10s
            # Fires when the 0.9 quantile computed from the 1m rate of
            # ticdc_sink_txn_exec_duration_bucket stays above 10 for one minute
            # (10s per the alert name).
    77      expr: 'histogram_quantile(0.9, rate(ticdc_sink_txn_exec_duration_bucket[1m])) > 10'
    78      for: '1m'
    79      labels:
    80        env: 'ENV_LABELS_ENV'
    81        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
    82        expr: 'histogram_quantile(0.9, rate(ticdc_sink_txn_exec_duration_bucket[1m])) > 10'
    83      annotations:
    84        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
    85        value: "{{ $value }}"
    86        summary: 'cdc sink execute_duration_time_more_than_10s'
    87  
    88    - alert: cdc_processor_checkpoint_tso_no_change_for_1m
            # Fires when ticdc_processor_checkpoint_ts shows fewer than one value
            # change inside a 1m window.
    89      expr: 'changes(ticdc_processor_checkpoint_ts[1m]) < 1'
            # Hold the condition for one minute before firing, like the other
            # rules in this file; without a `for` clause a single evaluation is
            # enough to raise the alert.
            for: '1m'
    90      labels:
    91        env: 'ENV_LABELS_ENV'
    92        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
    93        expr: 'changes(ticdc_processor_checkpoint_ts[1m]) < 1'
    94      annotations:
    95        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
    96        value: "{{ $value }}"
    97        summary: 'cdc processor checkpoint tso no change for 1m'
    98  
    99    - alert: ticdc_puller_entry_sorter_sort_bucket
            # Fires when the 0.9 quantile computed from the 1m rate of
            # ticdc_puller_entry_sorter_sort_bucket stays above 1 for one minute.
            # NOTE(review): another rule in this file alerts on the same histogram
            # at a threshold of 2; confirm both are wanted.
   100      expr: 'histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket{}[1m])) > 1'
   101      for: '1m'
   102      labels:
   103        env: 'ENV_LABELS_ENV'
   104        level: 'warning'
              # Copy of the rule expression, including the threshold, so the label
              # matches what is actually evaluated.
   105        expr: 'histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket{}[1m])) > 1'
   106      annotations:
   107        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
   108        value: "{{ $value }}"
   109        summary: 'ticdc puller entry sorter sort latency is too high'
   110  
   111    - alert: ticdc_puller_entry_sorter_merge_bucket
            # Fires when the 0.9 quantile computed from the 1m rate of
            # ticdc_puller_entry_sorter_merge_bucket stays above 1 for one minute.
            # NOTE(review): another rule in this file alerts on the same histogram
            # at a threshold of 2; confirm both are wanted.
   112      expr: 'histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket{}[1m])) > 1'
   113      for: '1m'
   114      labels:
   115        env: 'ENV_LABELS_ENV'
   116        level: 'warning'
              # Copy of the rule expression, including the threshold, so the label
              # matches what is actually evaluated.
   117        expr: 'histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket{}[1m])) > 1'
   118      annotations:
   119        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
   120        value: "{{ $value }}"
   121        summary: 'ticdc puller entry sorter merge latency is too high'
   122  
   123    - alert: tikv_cdc_min_resolved_ts_no_change_for_1m
            # Fires when tikv_cdc_min_resolved_ts shows fewer than one value
            # change inside a 1m window, and that stays true for one minute.
   124      expr: 'changes(tikv_cdc_min_resolved_ts[1m]) < 1'
   125      for: '1m'
   126      labels:
   127        env: 'ENV_LABELS_ENV'
   128        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
   129        expr: 'changes(tikv_cdc_min_resolved_ts[1m]) < 1'
   130      annotations:
   131        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
   132        value: "{{ $value }}"
   133        summary: 'tikv cdc min resolved ts no change for 1m'
   134  
   135    - alert: tikv_cdc_scan_duration_seconds_more_than_30s
            # Fires when the 0.9 quantile computed from the 1m rate of
            # tikv_cdc_scan_duration_seconds_bucket stays above 30 for one minute
            # (30s per the alert name).
   136      expr: 'histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 30'
   137      for: '1m'
   138      labels:
   139        env: 'ENV_LABELS_ENV'
   140        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
   141        expr: 'histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 30'
   142      annotations:
   143        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
   144        value: "{{ $value }}"
   145        summary: 'tikv cdc scan duration seconds more than 30s'
   146  
   147    - alert: ticdc_sink_mysql_execution_error
            # Fires when ticdc_sink_mysql_execution_error changed value at least
            # once inside a 1m window, and that stays true for one minute.
   148      expr: 'changes(ticdc_sink_mysql_execution_error[1m]) > 0'
   149      for: '1m'
   150      labels:
   151        env: 'ENV_LABELS_ENV'
   152        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
   153        expr: 'changes(ticdc_sink_mysql_execution_error[1m]) > 0'
   154      annotations:
   155        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
   156        value: "{{ $value }}"
   157        summary: 'cdc sink mysql execution meets errors'
   158  
   159    - alert: ticdc_processor_exit_with_error_count
            # Fires when ticdc_processor_exit_with_error_count changed value at
            # least once inside a 1m window, and that stays true for one minute.
   160      expr: 'changes(ticdc_processor_exit_with_error_count[1m]) > 0'
   161      for: '1m'
   162      labels:
   163        env: 'ENV_LABELS_ENV'
   164        level: 'critical'
              # Copy of the rule expression, attached to the alert as a label.
   165        expr: 'changes(ticdc_processor_exit_with_error_count[1m]) > 0'
   166      annotations:
   167        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
   168        value: "{{ $value }}"
   169        summary: 'cdc processor exits with error'
   170  
   171    - alert: ticdc_memory_abnormal
            # Fires when go_memstats_heap_alloc_bytes for job "ticdc" stays above
            # 1e+10 (10 GB per the summary) for one minute.
   172      expr: 'go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10'
   173      for: '1m'
   174      labels:
   175        env: 'ENV_LABELS_ENV'
   176        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
   177        expr: 'go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10'
   178      annotations:
              # A space follows "values:" so the rendered text matches the
              # description format used by the other rules.
   179        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
   180        value: "{{ $value }}"
   181        summary: 'TiCDC heap memory usage is over 10 GB'
   182  
   183    - alert: tikv_enabled_hibernate_regions
            # Fires when the sum of tikv_config_raftstore{name="hibernate_regions"}
            # stays above 0 for one minute.
            # NOTE(review): sum() without a by-clause drops all labels, so the
            # instance placeholder in the description likely renders empty - confirm.
   184      expr: 'sum(tikv_config_raftstore{name="hibernate_regions"}) > 0'
   185      for: '1m'
   186      labels:
   187        env: 'ENV_LABELS_ENV'
   188        level: 'warning'
              # Copy of the rule expression, attached to the alert as a label.
   189        expr: 'sum(tikv_config_raftstore{name="hibernate_regions"}) > 0'
   190      annotations:
   191        description: "cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}"
   192        value: "{{ $value }}"
   193        summary: 'cdc will break tikv hibernate regions'