github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/metrics/alertmanager/ticdc.rules.yml (about)

     1  groups:
     2  - name: alert.rules
     3    rules:
     4    # server related alert rules
     5    - alert: cdc_multiple_owners
     6      for: 1m  # the condition must hold for 1m before the alert fires
     7      expr: 'sum(rate(ticdc_owner_ownership_counter[240s])) >= 0.125'
     8      annotations:
     9        summary: 'cdc cluster has multiple owners'
    10        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    11        value: '{{ $value }}'
    12      labels:
    13        level: warning
    14        env: ENV_LABELS_ENV
    15        expr: 'sum(rate(ticdc_owner_ownership_counter[240s])) >= 0.125'  # copy of the rule expression
    16  
    17    - alert: cdc_no_owner
    18      for: 10m  # the condition must hold for 10m before the alert fires
    19      expr: 'sum(rate(ticdc_owner_ownership_counter[240s])) < 0.0625'
    20      annotations:
    21        summary: 'cdc cluster has no owner for more than 10 minutes'
    22        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    23        value: '{{ $value }}'
    24      labels:
    25        level: warning
    26        env: ENV_LABELS_ENV
    27        expr: 'sum(rate(ticdc_owner_ownership_counter[240s])) < 0.0625'  # copy of the rule expression
    28  
    29    # changefeed related alert rules
    30    - alert: ticdc_changefeed_failed
    31      for: 1m  # the condition must hold for 1m before the alert fires
    32      expr: '(max_over_time(ticdc_owner_status[1m]) == 2) > 0'  # 1m maximum of the status gauge equals 2
    33      annotations:
    34        summary: 'cdc changefeed failed, it can not be automatically resumed'
    35        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    36        value: '{{ $value }}'
    37      labels:
    38        level: critical
    39        env: ENV_LABELS_ENV
    40        expr: '(max_over_time(ticdc_owner_status[1m]) == 2) > 0'  # copy of the rule expression
    41  
    42    - alert: cdc_checkpoint_high_delay
    43      for: 1m  # the condition must hold for 1m before the alert fires
    44      expr: 'ticdc_owner_checkpoint_ts_lag > 600'
    45      annotations:
    46        summary: 'cdc owner checkpoint delay more than 10 minutes'
    47        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    48        value: '{{ $value }}'
    49      labels:
    50        level: critical
    51        env: ENV_LABELS_ENV
    52        expr: 'ticdc_owner_checkpoint_ts_lag > 600'  # copy of the rule expression
    53  
    54    - alert: cdc_resolvedts_high_delay
    55      for: 1m  # the condition must hold for 1m before the alert fires
    56      expr: 'ticdc_owner_resolved_ts_lag > 300'
    57      annotations:
    58        summary: 'cdc owner resolved ts delay more than 5 minutes'
    59        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    60        value: '{{ $value }}'
    61      labels:
    62        level: critical
    63        env: ENV_LABELS_ENV
    64        expr: 'ticdc_owner_resolved_ts_lag > 300'  # copy of the rule expression
    65  
    66    - alert: ticdc_sink_execution_error
    67      for: 1m  # the condition must hold for 1m before the alert fires
    68      expr: 'changes(ticdc_sink_execution_error[1m]) > 0'  # the series changed value within the last 1m
    69      annotations:
    70        summary: 'cdc sink execution meets errors'
    71        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    72        value: '{{ $value }}'
    73      labels:
    74        level: warning
    75        env: ENV_LABELS_ENV
    76        expr: 'changes(ticdc_sink_execution_error[1m]) > 0'  # copy of the rule expression
    77  
    78    - alert: ticdc_processor_exit_with_error_count
    79      for: 1m  # the condition must hold for 1m before the alert fires
    80      expr: 'changes(ticdc_processor_exit_with_error_count[1m]) > 0'  # the series changed value within the last 1m
    81      annotations:
    82        summary: 'cdc processor exits with error'
    83        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    84        value: '{{ $value }}'
    85      labels:
    86        level: warning
    87        env: ENV_LABELS_ENV
    88        expr: 'changes(ticdc_processor_exit_with_error_count[1m]) > 0'  # copy of the rule expression
    89  
    90    - alert: ticdc_changefeed_meet_error
    91      for: 1m  # the condition must hold for 1m before the alert fires
    92      expr: '(max_over_time(ticdc_owner_status[1m]) == 1) > 0'  # 1m maximum of the status gauge equals 1
    93      annotations:
    94        summary: 'cdc changefeed meet error'
    95        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
    96        value: '{{ $value }}'
    97      labels:
    98        level: warning
    99        env: ENV_LABELS_ENV
   100        expr: '(max_over_time(ticdc_owner_status[1m]) == 1) > 0'  # copy of the rule expression
   101    
   102    # tikv related alert rules
   103    - alert: tikv_cdc_min_resolved_ts_no_change_for_1m
   104      expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0 and ON (instance) tikv_cdc_captured_region_total > 0
   105      for: 1m  # the condition must hold for 1m before the alert fires
   106      labels:
   107        env: ENV_LABELS_ENV
   108        level: warning
   109        expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0 and ON (instance) tikv_cdc_captured_region_total > 0  # kept identical to the rule expression above
   110      annotations:
   111        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
   112        value: '{{ $labels.instance }}'  # NOTE(review): renders the instance label rather than $value - confirm this is intended
   113        summary: tikv cdc min resolved ts no change for 1m
   114  
   115    - alert: tikv_cdc_scan_duration_seconds_more_than_10min
   116      for: 1m  # the condition must hold for 1m before the alert fires
   117      expr: 'histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600'  # 0.9 quantile above 600
   118      annotations:
   119        summary: 'tikv cdc scan duration seconds more than 10 min'
   120        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
   121        value: '{{ $value }}'
   122      labels:
   123        level: warning
   124        env: ENV_LABELS_ENV
   125        expr: 'histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600'  # copy of the rule expression
   126  
   127    - alert: ticdc_memory_abnormal
   128      for: 1m  # the condition must hold for 1m before the alert fires
   129      expr: 'go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10'  # only series with job="ticdc" are considered
   130      annotations:
   131        summary: 'TiCDC heap memory usage is over 10 GB'
   132        description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
   133        value: '{{ $value }}'
   134      labels:
   135        level: warning
   136        env: ENV_LABELS_ENV
   137        expr: 'go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10'  # copy of the rule expression