github.com/ironcore-dev/gardener-extension-provider-ironcore@v0.3.2-0.20240314231816-8336447fb9a0/charts/internal/machine-controller-manager/seed/templates/configmap-monitoring.yaml (about)

     1  apiVersion: v1
     2  kind: ConfigMap
     3  metadata:
     4    name: machine-controller-manager-monitoring-config
     5    namespace: {{ .Release.Namespace }}
     6    labels:
     7      extensions.gardener.cloud/configuration: monitoring
     8  data:
     9    scrape_config: |
    10      - job_name: machine-controller-manager
    11        honor_labels: false
    12        kubernetes_sd_configs:
    13        - role: endpoints
    14          namespaces:
    15            names: [{{ .Release.Namespace }}]
    16        relabel_configs:
    17        - source_labels:
    18          - __meta_kubernetes_service_name
    19          - __meta_kubernetes_endpoint_port_name
    20          action: keep
    21          regex: machine-controller-manager;metrics
    22        # common metrics
    23        - action: labelmap
    24          regex: __meta_kubernetes_service_label_(.+)
    25        - source_labels: [ __meta_kubernetes_pod_name ]
    26          target_label: pod
    27        metric_relabel_configs:
    28        - source_labels: [ __name__ ]
    29          regex: ^(mcm_cloud_api_requests_failed_total|mcm_cloud_api_requests_total|mcm_machine_controller_frozen|mcm_machine_current_status_phase|mcm_machine_deployment_failed_machines|mcm_machine_items_total|mcm_machine_set_failed_machines|mcm_machine_deployment_items_total|mcm_machine_set_items_total|mcm_machine_set_stale_machines_total|mcm_scrape_failure_total|process_max_fds|process_open_fds|mcm_workqueue_adds_total|mcm_workqueue_depth|mcm_workqueue_queue_duration_seconds_bucket|mcm_workqueue_queue_duration_seconds_sum|mcm_workqueue_queue_duration_seconds_count|mcm_workqueue_work_duration_seconds_bucket|mcm_workqueue_work_duration_seconds_sum|mcm_workqueue_work_duration_seconds_count|mcm_workqueue_unfinished_work_seconds|mcm_workqueue_longest_running_processor_seconds|mcm_workqueue_retries_total)$
    30          action: keep
    31  
    32    alerting_rules: |
    33      machine-controller-manager.rules.yaml: |
    34        groups:
    35        - name: machine-controller-manager.rules
    36          rules:
    37          - alert: MachineControllerManagerDown
    38            expr: absent(up{job="machine-controller-manager"} == 1)
    39            for: 15m
    40            labels:
    41              service: machine-controller-manager
    42              severity: critical
    43              type: seed
    44              visibility: operator
    45            annotations:
    46              description: There are no running machine controller manager instances. No shoot nodes can be created/maintained.
    47              summary: Machine controller manager is down.
    48  
    49    dashboard_operators: |
    50      machine-controller-manager-dashboard.json: |-
    51  {{ .Files.Get "mcm-monitoring-dashboard.json" | indent 6 }}