github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/test/testdata/addon/cm-values.yaml

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-chart-kubeblocks-values
  namespace: default
data:
  values-kubeblocks-override.yaml: |-
    alertmanager:
      ## If false, alertmanager will not be installed
      ##
      enabled: true

      ## alertmanager container image
      ##
      image:
        repository: docker.io/apecloud/alertmanager
        tag: v0.24.0

      ## Node tolerations for alertmanager scheduling to nodes with taints
      ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
      ##
      tolerations: [ ]
        # - key: "key"
        #   operator: "Equal|Exists"
        #   value: "value"
        #   effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"

      persistentVolume:
        ## If true, alertmanager will create/use a Persistent Volume Claim
        ## If false, use emptyDir
        ##
        enabled: true

        ## alertmanager data Persistent Volume size
        ##
        size: 1Gi

        ## alertmanager data Persistent Volume Storage Class
        ## If defined, storageClassName: <storageClass>
        ## If set to "-", storageClassName: "", which disables dynamic provisioning
        ## If undefined (the default) or set to null, no storageClassName spec is
        ##   set, choosing the default provisioner.  (gp2 on AWS, standard on
        ##   GKE, AWS & OpenStack)
        ##
        # storageClass: "-"
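        ## Or pin an explicit class; the name below is illustrative, not a
        ## value shipped with this chart:
        # storageClass: "gp2"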

      ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
      ##
      replicaCount: 1

      statefulSet:
        ## If true, use a StatefulSet instead of a Deployment for pod management.
        ## This allows scaling replicas to more than one pod
        ##
        enabled: true

        ## Alertmanager headless service to use for the statefulset
        ##
        headless:
          ## Enable peer mesh service endpoints to enable the HA Alertmanager
          ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
          enableMeshPeer: true
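
        ## A rough HA sketch (an assumption, not a layout tested here): with
        ## enableMeshPeer above and statefulSet.enabled, raising replicaCount
        ## to e.g. 3 runs three Alertmanager replicas that gossip through the
        ## headless service.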

      ## alertmanager resource requests and limits
      ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
      ##
      resources: {}
        # limits:
        #   cpu: 10m
        #   memory: 32Mi
        # requests:
        #   cpu: 10m
        #   memory: 32Mi

      ## Security context to be added to alertmanager pods
      ##
      securityContext:
        runAsUser: 0
        runAsNonRoot: false
        runAsGroup: 65534
        fsGroup: 65534

      containerSecurityContext:
        allowPrivilegeEscalation: false

    kubeStateMetrics:
      ## If false, kube-state-metrics sub-chart will not be installed
      ##
      enabled: false

    nodeExporter:
      ## If false, node-exporter will not be installed
      ##
      enabled: false

      ## node-exporter container image
      ##
      image:
        repository: docker.io/apecloud/node-exporter
        tag: v1.3.1

    server:
      ## If false, Prometheus server will not be installed
      ##
      enabled: true

      ## Prometheus server container image
      ##
      image:
        repository: docker.io/apecloud/prometheus
        tag: v2.39.1

      global:
        ## How frequently to scrape targets by default
        ##
        scrape_interval: 15s
        ## How long until a scrape request times out
        ##
        scrape_timeout: 10s
        ## How frequently to evaluate rules
        ##
        evaluation_interval: 15s
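        ## Note: scrape_timeout must not exceed scrape_interval, or
        ## Prometheus rejects the configuration.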

      ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
      ##
      remoteWrite: []
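        ## Example remote_write target (commented out; URL and secret path
        ## are hypothetical):
        # - url: https://metrics-store.example.com/api/v1/push
        #   basic_auth:
        #     username: prometheus
        #     password_file: /etc/secrets/remote-write-password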

      ## Prefix used to register routes, overriding externalUrl route.
      ## Useful for proxies that rewrite URLs.
      ##
      routePrefix: /
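      ## e.g. routePrefix: /prometheus when the server is reverse-proxied
      ## under that path (illustrative value).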

      ## Node tolerations for server scheduling to nodes with taints
      ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
      ##
      tolerations: [ ]
        # - key: "key"
        #   operator: "Equal|Exists"
        #   value: "value"
        #   effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)"

      persistentVolume:
        ## If true, Prometheus server will create/use a Persistent Volume Claim
        ## If false, use emptyDir
        ##
        enabled: true

        ## Prometheus server data Persistent Volume size
        ##
        size: 8Gi

        ## Prometheus server data Persistent Volume Storage Class
        ## If defined, storageClassName: <storageClass>
        ## If set to "-", storageClassName: "", which disables dynamic provisioning
        ## If undefined (the default) or set to null, no storageClassName spec is
        ##   set, choosing the default provisioner.  (gp2 on AWS, standard on
        ##   GKE, AWS & OpenStack)
        ##
        # storageClass: "-"

      ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
      ##
      replicaCount: 1

      statefulSet:
        ## If true, use a StatefulSet instead of a Deployment for pod management.
        ## This allows scaling replicas to more than one pod
        ##
        enabled: true

      ## Prometheus server resource requests and limits
      ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
      ##
      resources: {}
        # limits:
        #   cpu: 500m
        #   memory: 512Mi
        # requests:
        #   cpu: 500m
        #   memory: 512Mi

      ## Prometheus' data retention period (default if not specified is 15 days)
      ##
      retention: "3d"

      ## Security context to be added to server pods
      ##
      securityContext:
        runAsUser: 0
        runAsNonRoot: false
        runAsGroup: 65534
        fsGroup: 65534

      containerSecurityContext:
        allowPrivilegeEscalation: false


    ## Sample prometheus rules/alerts
    ## NOTE: Please review these carefully, as the thresholds and behavior may
    ##       not match your SLOs or your label conventions.
    ##
    ruleFiles:
      cadvisor_alert_rules.yml: |
        groups:
          - name: GoogleCadvisor
            rules:
              - alert: ContainerKilled
                expr: 'time() - container_last_seen{container!="",container!="POD"} > 60'
                for: 0m
                labels:
                  severity: warning
                annotations:
                  summary: "Container killed (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})"
                  description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: ContainerCpuUsageWarning
                expr: 'sum(rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[2m])) BY (instance,pod,container) * 100 > 70'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "Container CPU usage is high (> 70%) (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})"
                  description: "Container CPU usage is above 70%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: ContainerCpuUsageCritical
                expr: 'sum(rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[2m])) BY (instance,pod,container) * 100 > 90'
                for: 1m
                labels:
                  severity: critical
                annotations:
                  summary: "Container CPU usage is very high (> 90%) (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})"
                  description: "Container CPU usage is above 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: ContainerMemoryUsage
                expr: 'sum(container_memory_working_set_bytes{container!="",container!="POD"}) BY (instance,pod,container) / sum(container_spec_memory_limit_bytes{container!="",container!="POD"} > 0) BY (instance,pod,container) * 100 > 90'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "Container Memory usage is high (> 90%) (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})"
                  description: "Container Memory usage is above 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

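              # predict_linear() fits the working-set trend over 15m and projects
              # it 30*60s (30 minutes) ahead; the alert fires as soon as the
              # projection reaches or exceeds the memory limit.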
              - alert: ContainerMemoryUsagePredict
                expr: 'sum(predict_linear(container_memory_working_set_bytes{container!="",container!="POD"}[15m], 30*60)) BY (instance,pod,container) - sum(container_spec_memory_limit_bytes{container!="",container!="POD"} > 0) BY (instance,pod,container) >= 0'
                for: 0m
                labels:
                  severity: critical
                annotations:
                  summary: "Container Memory usage may exceed the limit 30 minutes later (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})"
                  description: "Container Memory usage may exceed the limit 30 minutes later\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: ContainerVolumeUsage
                expr: 'sum(container_fs_usage_bytes) BY (instance,device) / sum(container_fs_limit_bytes) BY (instance,device) * 100 > 90'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "Device Volume usage is high (> 90%) (node: {{ $labels.instance }}, device: {{ $labels.device }})"
                  description: "Device Volume usage is above 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: ContainerHighCpuThrottleRate
                expr: 'rate(container_cpu_cfs_throttled_seconds_total{container!="",container!="POD"}[2m]) > 1'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "Container high throttle rate (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})"
                  description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      mysql_alert_rules.yml: |
        groups:
          - name: MysqldExporter
            rules:
              - alert: MysqlDown
                expr: 'max_over_time(mysql_up[1m]) == 0'
                for: 1m
                labels:
                  severity: critical
                annotations:
                  summary: "MySQL is down (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "MySQL instance is down on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: MysqlRestarted
                expr: 'mysql_global_status_uptime < 60'
                for: 0m
                labels:
                  severity: info
                annotations:
                  summary: "MySQL restarted (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "MySQL was restarted less than one minute ago on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: MysqlTooManyConnections
                expr: 'sum(max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 80'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "MySQL too many connections (> 80%) (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: MysqlConnectionErrors
                expr: 'sum(increase(mysql_global_status_connection_errors_total[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "MySQL connection errors (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "MySQL server has some connection errors on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: MysqlHighThreadsRunning
                expr: 'sum(max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 60'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "MySQL high threads running (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: MysqlSlowQueries
                expr: 'sum(increase(mysql_global_status_slow_queries[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0'
                for: 2m
                labels:
                  severity: info
                annotations:
                  summary: "MySQL slow queries (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "MySQL server has new slow queries on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: MysqlInnodbLogWaits
                expr: 'sum(rate(mysql_global_status_innodb_log_waits[5m])) BY (namespace,app_kubernetes_io_instance,pod) > 10'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "MySQL InnoDB log waits (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "MySQL InnoDB log writes are stalling on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

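              # mysql_global_status_innodb_buffer_pool_reads counts logical reads
              # served from disk; divided by buffer_pool_read_requests this is the
              # miss rate, so the rule fires when > 5% of reads bypass the pool.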
              - alert: MysqlInnodbBufferPoolHits
                expr: 'sum(rate(mysql_global_status_innodb_buffer_pool_reads[5m]) / rate(mysql_global_status_innodb_buffer_pool_read_requests[5m])) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 5'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "MySQL InnoDB high read requests rate hitting disk (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "High number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      postgresql_alert_rules.yml: |
        groups:
          - name: PostgreSQLExporter
            rules:
              - alert: PostgreSQLDown
                expr: 'max_over_time(pg_up[1m]) == 0'
                for: 1m
                labels:
                  severity: critical
                annotations:
                  summary: "PostgreSQL is down (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "PostgreSQL instance is down on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLExporterError
                expr: 'pg_exporter_last_scrape_error > 0'
                for: 0m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL exporter scrape error (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "PostgreSQL exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLTooManySlowQueries
                expr: |
                  max by(namespace,app_kubernetes_io_instance,pod,datname) (
                    max_over_time(pg_stat_activity_max_tx_duration{datname!~"template.*|postgres"}[2m])
                  ) > 60
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL database {{ $labels.datname }} high number of slow queries (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "PostgreSQL high number of slow queries\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

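              # "> on(...)" restricts the vector match to the listed labels, so
              # each instance's connection count is compared against its own
              # max_connections budget.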
              - alert: PostgreSQLTooManyConnections
                expr: |
                  sum by (namespace,app_kubernetes_io_instance,pod) (pg_stat_activity_count{datname!~"template.*|postgres"})
                  > on(namespace,app_kubernetes_io_instance,pod)
                  (pg_settings_max_connections - pg_settings_superuser_reserved_connections) * 0.8
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL too many connections (> 80%) (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "PostgreSQL instance has too many connections (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLDeadLocks
                expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres", datname!=""}[2m]) > 5'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL database {{ $labels.datname }} dead locks (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "PostgreSQL has deadlocks\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLHighRollbackRate
                expr: |
                  rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres", datname!=""}[2m])
                  /
                  rate(pg_stat_database_xact_commit{datname!~"template.*|postgres", datname!=""}[2m])
                  > 0.1
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL database {{ $labels.datname }} high rollback rate (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "Ratio of transactions being aborted compared to committed is > 10%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLTooManyLocksAcquired
                expr: |
                  sum by (namespace,app_kubernetes_io_instance,pod) (pg_locks_count)
                  / on(namespace,app_kubernetes_io_instance,pod)
                  (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
                  > 0.2
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL too many locks acquired (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLCacheHitRatio
                expr: |
                  avg by (namespace,app_kubernetes_io_instance,pod,datname) (
                    rate(pg_stat_database_blks_hit{datname!~"template.*|postgres", datname!=""}[2m])
                    /
                    (
                      rate(
                        pg_stat_database_blks_hit{datname!~"template.*|postgres", datname!=""}[2m]
                      )
                      +
                      rate(
                        pg_stat_database_blks_read{datname!~"template.*|postgres", datname!=""}[2m]
                      )
                    )
                  ) < 0.9
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL database {{ $labels.datname }} has low cache hit rate (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "Low cache hit rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLMaxWriteBufferReached
                expr: 'rate(pg_stat_bgwriter_maxwritten_clean_total[2m]) > 0'
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL write buffers reached max (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "The PostgreSQL background writer stopped a cleaning scan because it had written too many buffers\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLHighWALFilesArchiveErrorRate
                expr: |
                  rate(pg_stat_archiver_failed_count[2m])
                  / (
                    rate(pg_stat_archiver_archived_count[2m]) + rate(pg_stat_archiver_failed_count[2m])
                  ) > 0.1
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL high error rate in WAL files archiver (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "PostgreSQL high error rate in WAL files archiver\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLTableNotAutoVacuumed
                expr: |
                  (pg_stat_user_tables_last_autovacuum > 0)
                  and
                  (time() - pg_stat_user_tables_last_autovacuum)
                  > 24 * 60 * 60 * 10
                for: 0m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL table {{ $labels.relname }} in database {{ $labels.datname }} not auto vacuumed (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "Table {{ $labels.relname }} in database {{ $labels.datname }} has not been auto vacuumed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

              - alert: PostgreSQLTableNotAutoAnalyzed
                expr: |
                  (pg_stat_user_tables_last_autoanalyze > 0)
                  and
                  (time() - pg_stat_user_tables_last_autoanalyze)
                  > 24 * 60 * 60 * 10
                for: 0m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL table {{ $labels.relname }} in database {{ $labels.datname }} not auto analyzed (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "Table {{ $labels.relname }} in database {{ $labels.datname }} has not been auto analyzed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

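              # In PromQL the "> 10000" comparison acts as a filter: only tables
              # with more than 10k dead tuples enter the ratio check below.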
              - alert: PostgreSQLTableTooManyDeadTuples
                expr: |
                  (pg_stat_user_tables_n_dead_tup > 10000)
                  /
                  (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
                  >= 0.1
                for: 2m
                labels:
                  severity: warning
                annotations:
                  summary: "PostgreSQL table {{ $labels.relname }} in database {{ $labels.datname }} has too many dead tuples (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})"
                  description: "Table {{ $labels.relname }} in database {{ $labels.datname }} has a dead tuple count that is too large\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    serverFiles:
      prometheus.yml:
        rule_files:
          - /etc/config/recording_rules.yml
          - /etc/config/alerting_rules.yml
          - /etc/config/cadvisor_alert_rules.yml
          - /etc/config/mysql_alert_rules.yml
          - /etc/config/postgresql_alert_rules.yml

        scrape_configs:
          - job_name: prometheus
            static_configs:
              - targets:
                  - localhost:9090

          # Scrape config for API servers.
          #
          # Kubernetes exposes API servers as endpoints of the default/kubernetes
          # service, so this uses the `endpoints` role and relabelling to keep only
          # the endpoints associated with the default/kubernetes service on the
          # default named port `https`. This works for single API server deployments as
          # well as HA API server deployments.
          - job_name: 'kubernetes-apiservers'

            kubernetes_sd_configs:
              - role: endpoints

            # Default to scraping over https. If required, just disable this or change to
            # `http`.
            scheme: https

            # This TLS & bearer token file config is used to connect to the actual scrape
            # endpoints for cluster components. This is separate from the discovery auth
            # configuration because discovery & scraping are two separate concerns in
            # Prometheus. The discovery auth config is automatic if Prometheus runs inside
            # the cluster. Otherwise, more config options have to be provided within the
            # <kubernetes_sd_config>.
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
              # If your node certificates are self-signed or use a different CA than the
              # master CA, disable certificate verification below. Note that
              # certificate verification is an integral part of a secure infrastructure,
              # so this should only be done in a controlled environment; it is
              # disabled here by the line below.
              #
              insecure_skip_verify: true
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

            # Keep only the default/kubernetes service endpoints for the https port. This
            # adds a target for each API server for which Kubernetes adds an endpoint to
            # the default/kubernetes service.
            relabel_configs:
              - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ]
                action: keep
                regex: default;kubernetes;https

          - job_name: 'kubernetes-nodes'

            # Default to scraping over https. If required, just disable this or change to
            # `http`.
            scheme: https

            # This TLS & bearer token file config is used to connect to the actual scrape
            # endpoints for cluster components. This is separate from the discovery auth
            # configuration because discovery & scraping are two separate concerns in
            # Prometheus. The discovery auth config is automatic if Prometheus runs inside
            # the cluster. Otherwise, more config options have to be provided within the
            # <kubernetes_sd_config>.
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
              # If your node certificates are self-signed or use a different CA than the
              # master CA, disable certificate verification below. Note that
              # certificate verification is an integral part of a secure infrastructure,
              # so this should only be done in a controlled environment; it is
              # disabled here by the line below.
              #
              insecure_skip_verify: true
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

            kubernetes_sd_configs:
              - role: node

            relabel_configs:
              - action: labelmap
                regex: __meta_kubernetes_node_label_(.+)
              - target_label: __address__
                replacement: kubernetes.default.svc:443
              - source_labels: [ __meta_kubernetes_node_name ]
                regex: (.+)
                target_label: __metrics_path__
                replacement: /api/v1/nodes/$1/proxy/metrics

          - job_name: 'kubernetes-nodes-cadvisor'

            # Default to scraping over https. If required, just disable this or change to
            # `http`.
            scheme: https

            # This TLS & bearer token file config is used to connect to the actual scrape
            # endpoints for cluster components. This is separate from the discovery auth
            # configuration because discovery & scraping are two separate concerns in
            # Prometheus. The discovery auth config is automatic if Prometheus runs inside
            # the cluster. Otherwise, more config options have to be provided within the
            # <kubernetes_sd_config>.
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
              # If your node certificates are self-signed or use a different CA than the
              # master CA, disable certificate verification below. Note that
              # certificate verification is an integral part of a secure infrastructure,
              # so this should only be done in a controlled environment; it is
              # disabled here by the line below.
              #
              insecure_skip_verify: true
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

            kubernetes_sd_configs:
              - role: node

            # This configuration works only on kubelet 1.7.3+, since the scrape
            # endpoints for cAdvisor changed in that release. If you are using an
            # older version, change the replacement to
            # replacement: /api/v1/nodes/$1:4194/proxy/metrics
            # More info: https://github.com/coreos/prometheus-operator/issues/633
            relabel_configs:
              - action: labelmap
                regex: __meta_kubernetes_node_label_(.+)
              - target_label: __address__
                replacement: kubernetes.default.svc:443
              - source_labels: [ __meta_kubernetes_node_name ]
                regex: (.+)
                target_label: __metrics_path__
                replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor

          # Example scrape config for pods
          #
          # The relabeling allows the actual pod scrape endpoint to be configured via the
          # following annotations:
          #
          # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`,
          # except if `prometheus.io/scrape-slow` is set to `true` as well.
          # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
          # to set this to `https` & most likely set the `tls_config` of the scrape config.
          # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
          # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
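          #
          # For instance, a pod exposing metrics on port 9187 at /metrics would
          # carry (values are illustrative):
          #   metadata:
          #     annotations:
          #       prometheus.io/scrape: "true"
          #       prometheus.io/port: "9187"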
          - job_name: 'kubernetes-pods'
            honor_labels: true

            kubernetes_sd_configs:
              - role: pod

            relabel_configs:
              - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ]
                action: keep
                regex: true
              - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ]
                action: drop
                regex: true
              - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ]
                action: replace
                regex: (https?)
                target_label: __scheme__
              - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ]
                action: replace
                target_label: __metrics_path__
                regex: (.+)
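              # The regex below strips any port already present in __address__
              # ($1) and substitutes the port from the annotation ($2).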
              - source_labels: [ __address__, __meta_kubernetes_pod_annotation_prometheus_io_port ]
                action: replace
                regex: (.+?)(?::\d+)?;(\d+)
                replacement: $1:$2
                target_label: __address__
              - action: labelmap
                regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
                replacement: __param_$1
              - action: labeldrop
                regex: __meta_kubernetes_pod_label_controller_(.+)
              - action: labeldrop
                regex: __meta_kubernetes_pod_label_statefulset_(.+)
              - action: labeldrop
                regex: __meta_kubernetes_pod_label_cs_(.+)
              - action: labelmap
                regex: __meta_kubernetes_pod_label_(.+)
              - source_labels: [ __meta_kubernetes_namespace ]
                action: replace
                target_label: namespace
              - source_labels: [ __meta_kubernetes_pod_name ]
                action: replace
                target_label: pod
              - source_labels: [ __meta_kubernetes_pod_phase ]
                regex: Pending|Succeeded|Failed|Completed
                action: drop

          # Example scrape config for pods which should be scraped slower. A useful
          # example would be stackdriver-exporter, which queries an API on every
          # scrape of the pod.
          #
          # The relabeling allows the actual pod scrape endpoint to be configured via the
          # following annotations:
          #
          # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true`
          # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
          # to set this to `https` & most likely set the `tls_config` of the scrape config.
          # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
          # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
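          #
          # For instance, a pod that should only be scraped on the slow 5m cycle
          # would carry (port value is illustrative):
          #   metadata:
          #     annotations:
          #       prometheus.io/scrape-slow: "true"
          #       prometheus.io/port: "9187"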
          - job_name: 'kubernetes-pods-slow'
            honor_labels: true

            scrape_interval: 5m
            scrape_timeout: 30s

            kubernetes_sd_configs:
              - role: pod

            relabel_configs:
              - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ]
                action: keep
                regex: true
              - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ]
                action: replace
                regex: (https?)
                target_label: __scheme__
              - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ]
                action: replace
                target_label: __metrics_path__
                regex: (.+)
              - source_labels: [ __address__, __meta_kubernetes_pod_annotation_prometheus_io_port ]
                action: replace
                regex: (.+?)(?::\d+)?;(\d+)
                replacement: $1:$2
                target_label: __address__
              - action: labelmap
                regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
                replacement: __param_$1
              - action: labeldrop
                regex: __meta_kubernetes_pod_label_controller_(.+)
              - action: labeldrop
                regex: __meta_kubernetes_pod_label_statefulset_(.+)
              - action: labeldrop
                regex: __meta_kubernetes_pod_label_cs_(.+)
              - action: labelmap
                regex: __meta_kubernetes_pod_label_(.+)
              - source_labels: [ __meta_kubernetes_namespace ]
                action: replace
                target_label: namespace
              - source_labels: [ __meta_kubernetes_pod_name ]
                action: replace
                target_label: pod
              - source_labels: [ __meta_kubernetes_pod_phase ]
                regex: Pending|Succeeded|Failed|Completed
                action: drop

    pushgateway:
      ## If false, pushgateway will not be installed
      ##
      enabled: false