github.com/1aal/kubeblocks@v0.0.0-20231107070852-e1c03e598921/test/testdata/addon/cm-values.yaml (about) 1 apiVersion: v1 2 kind: ConfigMap 3 metadata: 4 name: prometheus-chart-kubeblocks-values 5 namespace: default 6 data: 7 values-kubeblocks-override.yaml: |- 8 alertmanager: 9 ## If false, alertmanager will not be installed 10 ## 11 enabled: true 12 13 ## alertmanager container image 14 ## 15 image: 16 repository: docker.io/apecloud/alertmanager 17 tag: v0.24.0 18 19 ## Node tolerations for alertmanager scheduling to nodes with taints 20 ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ 21 ## 22 tolerations: [ ] 23 # - key: "key" 24 # operator: "Equal|Exists" 25 # value: "value" 26 # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" 27 28 persistentVolume: 29 ## If true, alertmanager will create/use a Persistent Volume Claim 30 ## If false, use emptyDir 31 ## 32 enabled: true 33 34 ## alertmanager data Persistent Volume size 35 ## 36 size: 1Gi 37 38 ## alertmanager data Persistent Volume Storage Class 39 ## If defined, storageClassName: <storageClass> 40 ## If set to "-", storageClassName: "", which disables dynamic provisioning 41 ## If undefined (the default) or set to null, no storageClassName spec is 42 ## set, choosing the default provisioner. (gp2 on AWS, standard on 43 ## GKE, AWS & OpenStack) 44 ## 45 # storageClass: "-" 46 47 ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below) 48 ## 49 replicaCount: 1 50 51 statefulSet: 52 ## If true, use a statefulset instead of a deployment for pod management. 
53 ## This allows to scale replicas to more than 1 pod 54 ## 55 enabled: true 56 57 ## Alertmanager headless service to use for the statefulset 58 ## 59 headless: 60 ## Enabling peer mesh service end points for enabling the HA alert manager 61 ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md 62 enableMeshPeer: true 63 64 ## alertmanager resource requests and limits 65 ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ 66 ## 67 resources: {} 68 # limits: 69 # cpu: 10m 70 # memory: 32Mi 71 # requests: 72 # cpu: 10m 73 # memory: 32Mi 74 75 ## Security context to be added to alertmanager pods 76 ## 77 securityContext: 78 runAsUser: 0 79 runAsNonRoot: false 80 runAsGroup: 65534 81 fsGroup: 65534 82 83 containerSecurityContext: 84 allowPrivilegeEscalation: false 85 86 kubeStateMetrics: 87 ## If false, kube-state-metrics sub-chart will not be installed 88 ## 89 enabled: false 90 91 nodeExporter: 92 ## If false, node-exporter will not be installed 93 ## 94 enabled: false 95 96 ## node-exporter container image 97 ## 98 image: 99 repository: docker.io/apecloud/node-exporter 100 tag: v1.3.1 101 102 server: 103 ## Prometheus server container name 104 ## 105 enabled: true 106 107 ## Prometheus server container image 108 ## 109 image: 110 repository: docker.io/apecloud/prometheus 111 tag: v2.39.1 112 113 global: 114 ## How frequently to scrape targets by default 115 ## 116 scrape_interval: 15s 117 ## How long until a scrape request times out 118 ## 119 scrape_timeout: 10s 120 ## How frequently to evaluate rules 121 ## 122 evaluation_interval: 15s 123 124 ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write 125 ## 126 remoteWrite: [] 127 128 ## Prefix used to register routes, overriding externalUrl route. 129 ## Useful for proxies that rewrite URLs. 
130 ## 131 routePrefix: / 132 133 ## Node tolerations for server scheduling to nodes with taints 134 ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ 135 ## 136 tolerations: [ ] 137 # - key: "key" 138 # operator: "Equal|Exists" 139 # value: "value" 140 # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" 141 142 persistentVolume: 143 ## If true, Prometheus server will create/use a Persistent Volume Claim 144 ## If false, use emptyDir 145 ## 146 enabled: true 147 148 ## Prometheus server data Persistent Volume size 149 ## 150 size: 8Gi 151 152 ## Prometheus server data Persistent Volume Storage Class 153 ## If defined, storageClassName: <storageClass> 154 ## If set to "-", storageClassName: "", which disables dynamic provisioning 155 ## If undefined (the default) or set to null, no storageClassName spec is 156 ## set, choosing the default provisioner. (gp2 on AWS, standard on 157 ## GKE, AWS & OpenStack) 158 ## 159 # storageClass: "-" 160 161 ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below) 162 ## 163 replicaCount: 1 164 165 statefulSet: 166 ## If true, use a statefulset instead of a deployment for pod management. 
167 ## This allows to scale replicas to more than 1 pod 168 ## 169 enabled: true 170 171 ## Prometheus server resource requests and limits 172 ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ 173 ## 174 resources: {} 175 # limits: 176 # cpu: 500m 177 # memory: 512Mi 178 # requests: 179 # cpu: 500m 180 # memory: 512Mi 181 182 ## Prometheus' data retention period (default if not specified is 15 days) 183 ## 184 retention: "3d" 185 186 ## Security context to be added to server pods 187 ## 188 securityContext: 189 runAsUser: 0 190 runAsNonRoot: false 191 runAsGroup: 65534 192 fsGroup: 65534 193 194 containerSecurityContext: 195 allowPrivilegeEscalation: false 196 197 ## Sample prometheus rules/alerts 198 ## NOTE: Please review these carefully as thresholds and behavior may not meet 199 ## your SLOs or labels. 200 ## 201 ruleFiles: 202 cadvisor_alert_rules.yml: | 203 groups: 204 - name: GoogleCadvisor 205 rules: 206 - alert: ContainerKilled 207 expr: 'time() - container_last_seen{container!="",container!="POD"} > 60' 208 for: 0m 209 labels: 210 severity: warning 211 annotations: 212 summary: "Container killed (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})" 213 description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 214 215 - alert: ContainerCpuUsageWarning 216 expr: 'sum(rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[2m])) BY (instance,pod,container) * 100 > 70' 217 for: 2m 218 labels: 219 severity: warning 220 annotations: 221 summary: "Container CPU usage is high (> 70%) (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})" 222 description: "Container CPU usage is above 70%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 223 224 - alert: ContainerCpuUsageCritical 225 expr: 'sum(rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[2m])) BY (instance,pod,container) * 100 > 90' 226 for: 1m 227 
labels: 228 severity: critical 229 annotations: 230 summary: "Container CPU usage is very high (> 90%) (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})" 231 description: "Container CPU usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 232 233 - alert: ContainerMemoryUsage 234 expr: 'sum(container_memory_working_set_bytes{container!="",container!="POD"}) BY (instance,pod,container) / sum(container_spec_memory_limit_bytes{container!="",container!="POD"} > 0) BY (instance,pod,container) * 100 > 90' 235 for: 2m 236 labels: 237 severity: warning 238 annotations: 239 summary: "Container Memory usage is high (> 90%) (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})" 240 description: "Container Memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 241 242 - alert: ContainerMemoryUsagePredict 243 expr: 'sum(predict_linear(container_memory_working_set_bytes{container!="",container!="POD"}[15m], 30*60)) BY (instance,pod,container) - sum(container_spec_memory_limit_bytes{container!="",container!="POD"} > 0) BY (instance,pod,container) >= 0' 244 for: 0m 245 labels: 246 severity: critical 247 annotations: 248 summary: "Container Memory usage may exceed the limit 30 minutes later (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})" 249 description: "Container Memory usage may exceed the limit 30 minutes later\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 250 251 - alert: ContainerVolumeUsage 252 expr: 'sum(container_fs_usage_bytes) BY (instance,device) / sum(container_fs_limit_bytes) BY (instance,device) * 100 > 90' 253 for: 2m 254 labels: 255 severity: warning 256 annotations: 257 summary: "Device Volume usage is high (> 90%) (node: {{ $labels.instance }}, device: {{ $labels.device }})" 258 description: "Device Volume usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 259 260 - alert: 
ContainerHighCpuThrottleRate 261 expr: 'rate(container_cpu_cfs_throttled_seconds_total{container!="",container!="POD"}[2m]) > 1' 262 for: 2m 263 labels: 264 severity: warning 265 annotations: 266 summary: "Container high throttle rate (node: {{ $labels.instance }}, pod: {{ $labels.pod }}, container: {{ $labels.container }})" 267 description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 268 269 mysql_alert_rules.yml: | 270 groups: 271 - name: MysqldExporter 272 rules: 273 - alert: MysqlDown 274 expr: 'max_over_time(mysql_up[1m]) == 0' 275 for: 1m 276 labels: 277 severity: critical 278 annotations: 279 summary: "MySQL is down (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 280 description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 281 282 - alert: MysqlRestarted 283 expr: 'mysql_global_status_uptime < 60' 284 for: 0m 285 labels: 286 severity: info 287 annotations: 288 summary: "MySQL restarted (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 289 description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 290 291 - alert: MysqlTooManyConnections 292 expr: 'sum(max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 80' 293 for: 2m 294 labels: 295 severity: warning 296 annotations: 297 summary: "MySQL too many connections (> 80%) (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 298 description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 299 300 - alert: MysqlConnectionErrors 301 expr: 
'sum(increase(mysql_global_status_connection_errors_total[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0' 302 for: 2m 303 labels: 304 severity: warning 305 annotations: 306 summary: "MySQL connection errors (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 307 description: "MySQL server has some connection errors on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 308 309 - alert: MysqlHighThreadsRunning 310 expr: 'sum(max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 60' 311 for: 2m 312 labels: 313 severity: warning 314 annotations: 315 summary: "MySQL high threads running (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 316 description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 317 318 - alert: MysqlSlowQueries 319 expr: 'sum(increase(mysql_global_status_slow_queries[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0' 320 for: 2m 321 labels: 322 severity: info 323 annotations: 324 summary: "MySQL slow queries (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 325 description: "MySQL server has some new slow query on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 326 327 - alert: MysqlInnodbLogWaits 328 expr: 'sum(rate(mysql_global_status_innodb_log_waits[5m])) BY (namespace,app_kubernetes_io_instance,pod) > 10' 329 for: 2m 330 labels: 331 severity: warning 332 annotations: 333 summary: "MySQL InnoDB log waits (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 334 description: "MySQL innodb log writes stalling on {{ $labels.instance }}\n VALUE = {{ 
$value }}\n LABELS = {{ $labels }}" 335 336 - alert: MysqlInnodbBufferPoolHits 337 expr: 'sum(rate(mysql_global_status_innodb_buffer_pool_reads[5m]) / rate(mysql_global_status_innodb_buffer_pool_read_requests[5m])) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 5' 338 for: 2m 339 labels: 340 severity: warning 341 annotations: 342 summary: "MySQL InnoDB high read requests rate hitting disk (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 343 description: "High number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 344 345 postgresql_alert_rules.yml: | 346 groups: 347 - name: PostgreSQLExporter 348 rules: 349 - alert: PostgreSQLDown 350 expr: 'max_over_time(pg_up[1m]) == 0' 351 for: 1m 352 labels: 353 severity: critical 354 annotations: 355 summary: "PostgreSQL is down (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 356 description: "PostgreSQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 357 358 - alert: PostgreSQLExporterError 359 expr: 'pg_exporter_last_scrape_error > 0' 360 for: 0m 361 labels: 362 severity: warning 363 annotations: 364 summary: "PostgreSQL exporter scrape error (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 365 description: "PostgreSQL exporter is showing errors. 
A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 366 367 - alert: PostgreSQLTooManySlowQueries 368 expr: | 369 max by(namespace,app_kubernetes_io_instance,pod,datname) ( 370 max_over_time(pg_stat_activity_max_tx_duration{datname!~"template.*|postgres"}[2m]) 371 ) > 60 372 for: 2m 373 labels: 374 severity: warning 375 annotations: 376 summary: "PostgreSQL database {{ $labels.datname }} high number of slow queries (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 377 description: "PostgreSQL high number of slow queries\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 378 379 - alert: PostgreSQLTooManyConnections 380 expr: | 381 sum by (namespace,app_kubernetes_io_instance,pod) (pg_stat_activity_count{datname!~"template.*|postgres"}) 382 > on(namespace,app_kubernetes_io_instance,pod) 383 (pg_settings_max_connections - pg_settings_superuser_reserved_connections) * 0.8 384 for: 2m 385 labels: 386 severity: warning 387 annotations: 388 summary: "PostgreSQL too many connections (> 80%) (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 389 description: "PostgreSQL instance has too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 390 391 - alert: PostgreSQLDeadLocks 392 expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres", datname!=""}[2m]) > 5' 393 for: 2m 394 labels: 395 severity: warning 396 annotations: 397 summary: "PostgreSQL database {{ $labels.datname }} dead locks (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 398 description: "PostgreSQL has deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 399 400 - alert: PostgreSQLHighRollbackRate 401 expr: | 402 rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres", datname!=""}[2m]) 403 / 404 
rate(pg_stat_database_xact_commit{datname!~"template.*|postgres", datname!=""}[2m]) 405 > 0.1 406 for: 2m 407 labels: 408 severity: warning 409 annotations: 410 summary: "PostgreSQL database {{ $labels.datname }} high rollback rate (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 411 description: "Ratio of transactions being aborted compared to committed is > 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 412 413 - alert: PostgreSQLTooManyLocksAcquired 414 expr: | 415 sum by (namespace,app_kubernetes_io_instance,pod) (pg_locks_count) 416 / on(namespace,app_kubernetes_io_instance,pod) 417 (pg_settings_max_locks_per_transaction * pg_settings_max_connections) 418 > 0.2 419 for: 2m 420 labels: 421 severity: warning 422 annotations: 423 summary: "PostgreSQL too many locks acquired (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 424 description: "Too many locks acquired on the database. 
If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 425 426 - alert: PostgreSQLCacheHitRatio 427 expr: | 428 avg by (namespace,app_kubernetes_io_instance,pod,datname) ( 429 rate(pg_stat_database_blks_hit{datname!~"template.*|postgres", datname!=""}[2m]) 430 / 431 ( 432 rate( 433 pg_stat_database_blks_hit{datname!~"template.*|postgres", datname!=""}[2m] 434 ) 435 + 436 rate( 437 pg_stat_database_blks_read{datname!~"template.*|postgres", datname!=""}[2m] 438 ) 439 ) 440 ) < 0.9 441 for: 2m 442 labels: 443 severity: warning 444 annotations: 445 summary: "PostgreSQL database {{ $labels.datname }} has low cache hit rate (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 446 description: "Low cache hit rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 447 448 - alert: PostgreSQLMaxWriteBufferReached 449 expr: 'rate(pg_stat_bgwriter_maxwritten_clean_total[2m]) > 0' 450 for: 2m 451 labels: 452 severity: warning 453 annotations: 454 summary: "PostgreSQL write buffers reached max (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 455 description: "PostgreSQL background writer stops for max\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 456 457 - alert: PostgreSQLHighWALFilesArchiveErrorRate 458 expr: | 459 rate(pg_stat_archiver_failed_count[2m]) 460 / ( 461 rate(pg_stat_archiver_archived_count[2m]) + rate(pg_stat_archiver_failed_count[2m]) 462 ) > 0.1 463 for: 2m 464 labels: 465 severity: warning 466 annotations: 467 summary: "PostgreSQL high error rate in WAL files archiver (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 468 description: "PostgreSQL high error rate in WAL files archiver\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 469 470 - alert: 
PostgreSQLTableNotAutoVacuumed 471 expr: | 472 (pg_stat_user_tables_last_autovacuum > 0) 473 and 474 (time() - pg_stat_user_tables_last_autovacuum) 475 > 24 * 60 * 60 * 10 476 for: 0m 477 labels: 478 severity: warning 479 annotations: 480 summary: "PostgreSQL table {{ $labels.relname }} in database {{ $labels.datname }} not auto vacuumed (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 481 description: "Table {{ $labels.relname }} in database {{ $labels.datname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 482 483 - alert: PostgreSQLTableNotAutoAnalyzed 484 expr: | 485 (pg_stat_user_tables_last_autoanalyze > 0) 486 and 487 (time() - pg_stat_user_tables_last_autoanalyze) 488 > 24 * 60 * 60 * 10 489 for: 0m 490 labels: 491 severity: warning 492 annotations: 493 summary: "PostgreSQL table {{ $labels.relname }} in database {{ $labels.datname }} not auto analyzed (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 494 description: "Table {{ $labels.relname }} in database {{ $labels.datname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 495 496 - alert: PostgreSQLTableTooManyDeadTuples 497 expr: | 498 (pg_stat_user_tables_n_dead_tup > 10000) 499 / 500 (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) 501 >= 0.1 502 for: 2m 503 labels: 504 severity: warning 505 annotations: 506 summary: "PostgreSQL table {{ $labels.relname }} in database {{ $labels.datname }} has too many dead tuples (namespace: {{ $labels.namespace }}, cluster: {{ $labels.app_kubernetes_io_instance }}, instance: {{ $labels.pod }})" 507 description: "Table {{ $labels.relname }} in database {{ $labels.datname }} dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 508 509 serverFiles: 510 prometheus.yml: 511 rule_files: 512 - 
/etc/config/recording_rules.yml 513 - /etc/config/alerting_rules.yml 514 - /etc/config/cadvisor_alert_rules.yml 515 - /etc/config/mysql_alert_rules.yml 516 - /etc/config/postgresql_alert_rules.yml 517 518 scrape_configs: 519 - job_name: prometheus 520 static_configs: 521 - targets: 522 - localhost:9090 523 524 # Scrape config for API servers. 525 # 526 # Kubernetes exposes API servers as endpoints to the default/kubernetes 527 # service so this uses `endpoints` role and uses relabelling to only keep 528 # the endpoints associated with the default/kubernetes service using the 529 # default named port `https`. This works for single API server deployments as 530 # well as HA API server deployments. 531 - job_name: 'kubernetes-apiservers' 532 533 kubernetes_sd_configs: 534 - role: endpoints 535 536 # Default to scraping over https. If required, just disable this or change to 537 # `http`. 538 scheme: https 539 540 # This TLS & bearer token file config is used to connect to the actual scrape 541 # endpoints for cluster components. This is separate to discovery auth 542 # configuration because discovery & scraping are two separate concerns in 543 # Prometheus. The discovery auth config is automatic if Prometheus runs inside 544 # the cluster. Otherwise, more config options have to be provided within the 545 # <kubernetes_sd_config>. 546 tls_config: 547 ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 548 # If your node certificates are self-signed or use a different CA to the 549 # master CA, then disable certificate verification below. Note that 550 # certificate verification is an integral part of a secure infrastructure 551 # so this should only be disabled in a controlled environment. You can 552 # disable certificate verification by uncommenting the line below. 553 # 554 insecure_skip_verify: true 555 bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 556 557 # Keep only the default/kubernetes service endpoints for the https port. 
This 558 # will add targets for each API server which Kubernetes adds an endpoint to 559 # the default/kubernetes service. 560 relabel_configs: 561 - source_labels: [ __meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name ] 562 action: keep 563 regex: default;kubernetes;https 564 565 - job_name: 'kubernetes-nodes' 566 567 # Default to scraping over https. If required, just disable this or change to 568 # `http`. 569 scheme: https 570 571 # This TLS & bearer token file config is used to connect to the actual scrape 572 # endpoints for cluster components. This is separate to discovery auth 573 # configuration because discovery & scraping are two separate concerns in 574 # Prometheus. The discovery auth config is automatic if Prometheus runs inside 575 # the cluster. Otherwise, more config options have to be provided within the 576 # <kubernetes_sd_config>. 577 tls_config: 578 ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 579 # If your node certificates are self-signed or use a different CA to the 580 # master CA, then disable certificate verification below. Note that 581 # certificate verification is an integral part of a secure infrastructure 582 # so this should only be disabled in a controlled environment. You can 583 # disable certificate verification by uncommenting the line below. 584 # 585 insecure_skip_verify: true 586 bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 587 588 kubernetes_sd_configs: 589 - role: node 590 591 relabel_configs: 592 - action: labelmap 593 regex: __meta_kubernetes_node_label_(.+) 594 - target_label: __address__ 595 replacement: kubernetes.default.svc:443 596 - source_labels: [ __meta_kubernetes_node_name ] 597 regex: (.+) 598 target_label: __metrics_path__ 599 replacement: /api/v1/nodes/$1/proxy/metrics 600 601 - job_name: 'kubernetes-nodes-cadvisor' 602 603 # Default to scraping over https. If required, just disable this or change to 604 # `http`. 
605 scheme: https 606 607 # This TLS & bearer token file config is used to connect to the actual scrape 608 # endpoints for cluster components. This is separate to discovery auth 609 # configuration because discovery & scraping are two separate concerns in 610 # Prometheus. The discovery auth config is automatic if Prometheus runs inside 611 # the cluster. Otherwise, more config options have to be provided within the 612 # <kubernetes_sd_config>. 613 tls_config: 614 ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt 615 # If your node certificates are self-signed or use a different CA to the 616 # master CA, then disable certificate verification below. Note that 617 # certificate verification is an integral part of a secure infrastructure 618 # so this should only be disabled in a controlled environment. You can 619 # disable certificate verification by uncommenting the line below. 620 # 621 insecure_skip_verify: true 622 bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token 623 624 kubernetes_sd_configs: 625 - role: node 626 627 # This configuration will work only on kubelet 1.7.3+ 628 # As the scrape endpoints for cAdvisor have changed 629 # if you are using older version you need to change the replacement to 630 # replacement: /api/v1/nodes/$1:4194/proxy/metrics 631 # more info here https://github.com/coreos/prometheus-operator/issues/633 632 relabel_configs: 633 - action: labelmap 634 regex: __meta_kubernetes_node_label_(.+) 635 - target_label: __address__ 636 replacement: kubernetes.default.svc:443 637 - source_labels: [ __meta_kubernetes_node_name ] 638 regex: (.+) 639 target_label: __metrics_path__ 640 replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor 641 642 # Example scrape config for pods 643 # 644 # The relabeling allows the actual pod scrape endpoint to be configured via the 645 # following annotations: 646 # 647 # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, 648 # except if 
`prometheus.io/scrape-slow` is set to `true` as well. 649 # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need 650 # to set this to `https` & most likely set the `tls_config` of the scrape config. 651 # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. 652 # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 653 - job_name: 'kubernetes-pods' 654 honor_labels: true 655 656 kubernetes_sd_configs: 657 - role: pod 658 659 relabel_configs: 660 - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape ] 661 action: keep 662 regex: true 663 - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] 664 action: drop 665 regex: true 666 - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] 667 action: replace 668 regex: (https?) 669 target_label: __scheme__ 670 - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] 671 action: replace 672 target_label: __metrics_path__ 673 regex: (.+) 674 - source_labels: [ __address__, __meta_kubernetes_pod_annotation_prometheus_io_port ] 675 action: replace 676 regex: (.+?)(?::\d+)?;(\d+) 677 replacement: $1:$2 678 target_label: __address__ 679 - action: labelmap 680 regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) 681 replacement: __param_$1 682 - action: labeldrop 683 regex: __meta_kubernetes_pod_label_controller_(.+) 684 - action: labeldrop 685 regex: __meta_kubernetes_pod_label_statefulset_(.+) 686 - action: labeldrop 687 regex: __meta_kubernetes_pod_label_cs_(.+) 688 - action: labelmap 689 regex: __meta_kubernetes_pod_label_(.+) 690 - source_labels: [ __meta_kubernetes_namespace ] 691 action: replace 692 target_label: namespace 693 - source_labels: [ __meta_kubernetes_pod_name ] 694 action: replace 695 target_label: pod 696 - source_labels: [ __meta_kubernetes_pod_phase ] 697 regex: Pending|Succeeded|Failed|Completed 698 action: drop 699 700 # 
Example Scrape config for pods which should be scraped slower. A useful example 701 # would be stackdriver-exporter which queries an API on every scrape of the pod 702 # 703 # The relabeling allows the actual pod scrape endpoint to be configured via the 704 # following annotations: 705 # 706 # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` 707 # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need 708 # to set this to `https` & most likely set the `tls_config` of the scrape config. 709 # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. 710 # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 711 - job_name: 'kubernetes-pods-slow' 712 honor_labels: true 713 714 scrape_interval: 5m 715 scrape_timeout: 30s 716 717 kubernetes_sd_configs: 718 - role: pod 719 720 relabel_configs: 721 - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow ] 722 action: keep 723 regex: true 724 - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_scheme ] 725 action: replace 726 regex: (https?) 
727 target_label: __scheme__ 728 - source_labels: [ __meta_kubernetes_pod_annotation_prometheus_io_path ] 729 action: replace 730 target_label: __metrics_path__ 731 regex: (.+) 732 - source_labels: [ __address__, __meta_kubernetes_pod_annotation_prometheus_io_port ] 733 action: replace 734 regex: (.+?)(?::\d+)?;(\d+) 735 replacement: $1:$2 736 target_label: __address__ 737 - action: labelmap 738 regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) 739 replacement: __param_$1 740 - action: labeldrop 741 regex: __meta_kubernetes_pod_label_controller_(.+) 742 - action: labeldrop 743 regex: __meta_kubernetes_pod_label_statefulset_(.+) 744 - action: labeldrop 745 regex: __meta_kubernetes_pod_label_cs_(.+) 746 - action: labelmap 747 regex: __meta_kubernetes_pod_label_(.+) 748 - source_labels: [ __meta_kubernetes_namespace ] 749 action: replace 750 target_label: namespace 751 - source_labels: [ __meta_kubernetes_pod_name ] 752 action: replace 753 target_label: pod 754 - source_labels: [ __meta_kubernetes_pod_phase ] 755 regex: Pending|Succeeded|Failed|Completed 756 action: drop 757 758 pushgateway: 759 ## If false, pushgateway will not be installed 760 ## 761 enabled: false