## @section Common parameters
##

versionOverride:

## KubeBlocks container image settings
##
## @param image.registry KubeBlocks image registry
## @param image.repository KubeBlocks image repository
## @param image.pullPolicy KubeBlocks image pull policy
## @param image.tag KubeBlocks image tag (immutable tags are recommended)
## @param image.imagePullSecrets KubeBlocks image pull secrets
## @param image.tools.repository KubeBlocks tools image repository
image:
  registry: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com
  repository: apecloud/kubeblocks
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  tag: ""
  imagePullSecrets: []
  tools:
    repository: apecloud/kubeblocks-tools
  datascript:
    repository: apecloud/kubeblocks-datascript

## @param replicaCount
##
replicaCount: 1

## @param nameOverride
##
nameOverride: ""

## @param fullnameOverride
##
fullnameOverride: ""


## KubeBlocks RBAC access priority setting
##
## @param rbac.enabled enables or disables KubeBlocks RBAC access priority.
## When enabled, KubeBlocks can ensure resource accessibility for the cluster's
## pods, which it requires to manage clusters efficiently. It defaults to true.
## With RBAC access priority enabled, KubeBlocks holds the following permissions:
##   groups=core,resources=serviceaccounts,verbs=get;list;watch;create;update;patch;delete
##   groups=core,resources=serviceaccounts/status,verbs=get;update;patch
##   groups=core,resources=serviceaccounts/finalizers,verbs=update
##
##   groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;watch;create;update;patch;delete
##   groups=rbac.authorization.k8s.io,resources=rolebindings/status,verbs=get;update;patch
##   groups=rbac.authorization.k8s.io,resources=rolebindings/finalizers,verbs=update
##
##   groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;update;patch;delete
##   groups=rbac.authorization.k8s.io,resources=clusterrolebindings/status,verbs=get;update;patch
##   groups=rbac.authorization.k8s.io,resources=clusterrolebindings/finalizers,verbs=update
##
## If set to false, you must create the service account named
## `cluster.ComponentSpec.ServiceAccountName` and the corresponding (cluster)
## role bindings manually or through the cluster's Helm template, e.g.:
##   helm install mysql apecloud-mysql-cluster
rbac:
  enabled: true
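## With rbac.enabled set to false, a minimal sketch of the objects to create per
## cluster looks like the following; all names are illustrative, and the ClusterRole
## referenced below is assumed to be the pod role shipped by your KubeBlocks release:
##
##   apiVersion: v1
##   kind: ServiceAccount
##   metadata:
##     name: kb-sa-mycluster          # must match cluster.ComponentSpec.ServiceAccountName
##     namespace: default
##   ---
##   apiVersion: rbac.authorization.k8s.io/v1
##   kind: RoleBinding
##   metadata:
##     name: kb-rolebinding-mycluster
##     namespace: default
##   roleRef:
##     apiGroup: rbac.authorization.k8s.io
##     kind: ClusterRole
##     name: kubeblocks-cluster-pod-role   # assumption: verify the role name in your release
##   subjects:
##     - kind: ServiceAccount
##       name: kb-sa-mycluster
##       namespace: default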
## Deployment update strategy.
## Ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy
##
## @param updateStrategy.rollingUpdate
## @param updateStrategy.type
updateStrategy:
  rollingUpdate:
    maxSurge: 1
    maxUnavailable: 40%
  type: RollingUpdate

## Set `hostNetwork` to `true` when you want the KubeBlocks pod to share its host's network namespace.
## Useful for situations such as running a custom CNI on Amazon EKS.
## Update `dnsPolicy` accordingly to suit host network mode.
##
## @param hostNetwork
##
hostNetwork: false

## `dnsPolicy` determines the manner in which DNS resolution happens in the cluster.
## With `hostNetwork: true`, `ClusterFirstWithHostNet` is usually the appropriate `dnsPolicy`.
## For further reference: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-s-dns-policy.
##
## @param dnsPolicy
##
dnsPolicy: ClusterFirst

## Configure podDisruptionBudget spec settings
##
## @param podDisruptionBudget.minAvailable
## @param podDisruptionBudget.maxUnavailable
podDisruptionBudget:
  # Configures the minimum available pods for KubeBlocks disruptions.
  # Cannot be used if `maxUnavailable` is set.
  minAvailable: 1
  # Configures the maximum unavailable pods for KubeBlocks disruptions.
  # Cannot be used if `minAvailable` is set.
  maxUnavailable:


## Logger settings
##
## @param loggerSettings.developmentMode
## @param loggerSettings.encoder
## @param loggerSettings.level
## @param loggerSettings.timeEncoding
loggerSettings:
  # Development mode defaults (encoder=consoleEncoder, logLevel=Debug, stackTraceLevel=Warn).
  # Production mode defaults (encoder=jsonEncoder, logLevel=Info, stackTraceLevel=Error) (default false).
  developmentMode: false
  # Log encoding (one of 'json' or 'console')
  encoder: console
  # Log level, can be one of 'debug', 'info', 'error', or any integer value > 0
  # which corresponds to custom debug levels of increasing verbosity.
  level:
  # Zap time encoding (one of 'epoch', 'millis', 'nano', 'iso8601', 'rfc3339' or
  # 'rfc3339nano'). Defaults to 'iso8601'.
  timeEncoding: 'iso8601'

## ServiceAccount settings
##
## @param serviceAccount.create
## @param serviceAccount.annotations
## @param serviceAccount.name
serviceAccount:
  # Specifies whether a service account should be created
  create: true
  # Annotations to add to the service account
  annotations: {}
  # The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template
  name: ""

## @param podAnnotations
##
podAnnotations: {}

## Security context settings
##
## @param securityContext.allowPrivilegeEscalation
## @param securityContext.capabilities
securityContext:
  allowPrivilegeEscalation: false
  capabilities:
    drop:
      - ALL

## Pod security context settings
##
## @param podSecurityContext.runAsNonRoot
## @param podSecurityContext.readOnlyRootFilesystem
## @param podSecurityContext.runAsUser
## @param podSecurityContext.fsGroup
## @param podSecurityContext.seccompProfile
podSecurityContext:
  runAsNonRoot: true
  # readOnlyRootFilesystem: true
  # runAsUser: 1000
  # fsGroup: 2000
  # TODO(user): For common cases that do not require escalating privileges
  # it is recommended to ensure that all your Pods/Containers are restrictive.
  # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
  # Please uncomment the following code if your project does NOT have to work on old Kubernetes
  # versions < 1.19 or on vendor versions which do NOT support this field by default (i.e. OpenShift < 4.11).
  # seccompProfile:
  #   type: RuntimeDefault

## Service settings
##
## @param service.type
## @param service.port
## @param service.nodePort
service:
  type: ClusterIP
  port: 9443
  # -- Service node port.
  # Only used if `service.type` is `NodePort`.
  nodePort:


## Metrics serviceMonitor parameters
## Enable this if you're using Prometheus Operator
##
## @param serviceMonitor.enabled
## @param serviceMonitor.port
## @param serviceMonitor.nodePort
serviceMonitor:
  enabled: false
  # The metrics server will be exposed at this port.
  port: 8080
  # Only used if `service.type` is `NodePort`.
  nodePort:

## KubeBlocks pods deployment topologySpreadConstraints settings
##
## @param topologySpreadConstraints
topologySpreadConstraints: []
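## A minimal sketch of spreading KubeBlocks replicas across zones, assuming the
## standard topology.kubernetes.io/zone node label; adjust the selector labels to
## match your release:
##
## topologySpreadConstraints:
##   - maxSkew: 1
##     topologyKey: topology.kubernetes.io/zone
##     whenUnsatisfiable: ScheduleAnyway
##     labelSelector:
##       matchLabels:
##         app.kubernetes.io/name: kubeblocks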
## Resource settings
##
## @param resources.limits
## @param resources.requests
resources: {}
  # We usually recommend not to specify default resources and to leave this as a conscious
  # choice for the user. This also increases chances charts run on environments with little
  # resources, such as Minikube. If you do want to specify resources, uncomment the following
  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
  # TODO(user): Configure the resources accordingly based on the project requirements.
  # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
  # limits:
  #   cpu: 500m
  #   memory: 128Mi
  # requests:
  #   cpu: 10m
  #   memory: 64Mi

## @param priorityClassName
##
priorityClassName:

## Autoscaling settings
##
## @param autoscaling.enabled
## @param autoscaling.minReplicas
## @param autoscaling.maxReplicas
## @param autoscaling.targetCPUUtilizationPercentage
## @param autoscaling.targetMemoryUtilizationPercentage
autoscaling:
  enabled: false
  minReplicas: 1
  maxReplicas: 100
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80



## @param nodeSelector
##
nodeSelector: {}

## @param tolerations
##
tolerations:
  - key: kb-controller
    operator: Equal
    value: "true"
    effect: NoSchedule


## @param affinity
##
affinity:
  nodeAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        preference:
          matchExpressions:
            - key: kb-controller
              operator: In
              values:
                - "true"

## @param dataPlane -- data plane settings
##
dataPlane:
  tolerations:
    - key: kb-data
      operator: Equal
      value: "true"
      effect: NoSchedule

  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          preference:
            matchExpressions:
              - key: kb-data
                operator: In
                values:
                  - "true"

## AdmissionWebhooks settings
##
## @param admissionWebhooks.enabled
## @param admissionWebhooks.createSelfSignedCert
## @param admissionWebhooks.ignoreReplicasCheck
admissionWebhooks:
  enabled: false
  createSelfSignedCert: true
  ignoreReplicasCheck: false

## Data protection settings
##
## @param dataProtection.enabled - enable the dataProtection controllers for backup functions
## @param dataProtection.gcFrequencySeconds - the frequency of garbage collection
dataProtection:
  enabled: true
  # Customizing the encryption key is strongly recommended.
  # If you do not specify a custom key, the default key will be used.
  # Using the default key can potentially lead to the exposure of database
  # passwords if the 'get/list' permissions on backup CRs are compromised.
  encryptionKey: ""
  gcFrequencySeconds: 3600
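  # A hedged example of supplying your own key at install time; the openssl
  # command is illustrative, and any sufficiently random string works:
  #   helm install kubeblocks ... --set dataProtection.encryptionKey="$(openssl rand -hex 16)"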
  image:
    registry: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com
    repository: apecloud/kubeblocks-dataprotection
    pullPolicy: IfNotPresent
    # Overrides the image tag whose default is the chart appVersion.
    tag: ""
    imagePullSecrets: []
    datasafed:
      repository: apecloud/datasafed
      tag: "0.0.3"

## BackupRepo settings
##
## @param backupRepo.create - creates a backup repo during installation
## @param backupRepo.default - set the created repo as the default
## @param backupRepo.accessMethod - the access method for the backup repo, options: [Mount, Tool]
## @param backupRepo.storageProvider - the storage provider used by the repo, options: [s3, oss, minio]
## @param backupRepo.pvReclaimPolicy - the PV reclaim policy, options: [Retain, Delete]
## @param backupRepo.volumeCapacity - the capacity for creating PVC
## @param backupRepo.config - a key-value map containing the settings required by the storage provider
## @param backupRepo.secrets - a key-value map containing the secret values required by the storage provider
backupRepo:
  create: false
  default: true
  accessMethod: Tool
  storageProvider: ""
  pvReclaimPolicy: "Retain"
  volumeCapacity: ""
  config:
    bucket: ""
    endpoint: ""
    region: ""
  secrets:
    accessKeyId: ""
    secretAccessKey: ""
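## A minimal sketch of a repo backed by an in-cluster MinIO, assuming the minio
## storage provider; bucket, endpoint, and credentials are illustrative placeholders:
##
## backupRepo:
##   create: true
##   storageProvider: minio
##   config:
##     bucket: kb-backups
##     endpoint: http://minio.kb-system.svc.cluster.local:9000
##   secrets:
##     accessKeyId: <ACCESS_KEY>
##     secretAccessKey: <SECRET_KEY>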
## Addon controller settings; this requires the cluster-admin ClusterRole.
##
## @param addonController.enabled
## @param addonController.jobTTL - the addon job time-to-live period, as a time.Duration-parseable string.
##   Defaults to "5m" if not provided.
## @param addonController.jobImagePullPolicy - addon install job image pull policy.
addonController:
  enabled: true
  jobTTL: "5m"
  jobImagePullPolicy: IfNotPresent


## @param keepAddons - keep Addon CR objects when deleting this chart.
keepAddons: false

## @param addonChartLocationBase - location base for KubeBlocks official addons' charts, e.g. for releases in an air-gapped environment.
##   If the URL has the prefix "file://", KubeBlocks will use the Helm charts copied from the addonChartsImage.
##
addonChartLocationBase: file://

## @param addonChartsImage - addon charts image, used to copy Helm charts to the addon job container.
## @param addonChartsImage.chartsPath - the Helm charts path in the addon charts image.
addonChartsImage:
  registry: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com
  repository: apecloud/kubeblocks-charts
  pullPolicy: IfNotPresent
  tag: ""
  chartsPath: /charts

## @param addonHelmInstallOptions - addon helm install options.
addonHelmInstallOptions:
  - "--atomic"
  - "--cleanup-on-fail"
  - "--wait"
  - "--insecure-skip-tls-verify"
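## For an air-gapped install you would typically point addonChartLocationBase at an
## internal chart mirror instead of the default "file://"; a hedged example (the URL
## is illustrative):
##   helm upgrade --install kubeblocks ... \
##     --set addonChartLocationBase=https://charts.internal.example.com/kubeblocks
## With the "file://" prefix, the charts baked into addonChartsImage are used instead.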
## Prometheus Addon
##
prometheus:
  ## If false, the prometheus sub-chart will not be installed
  ##
  enabled: false

  alertmanager:
    ## If false, alertmanager will not be installed
    ##
    enabled: true

    ## alertmanager container image
    ##
    image:
      repository: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/alertmanager
      tag: v0.24.0

    ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.alertmanager.configMapOverrideName}}
    ## Defining configMapOverrideName will cause templates/alertmanager-configmap.yaml
    ## to NOT generate a ConfigMap resource
    ##
    configMapOverrideName: "alertmanager-config"

    ## Node tolerations for alertmanager scheduling to nodes with taints
    ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
    ##
    tolerations:
      - key: kb-controller
        operator: Equal
        value: "true"
        effect: NoSchedule

    affinity:
      nodeAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            preference:
              matchExpressions:
                - key: kb-controller
                  operator: In
                  values:
                    - "true"

    persistentVolume:
      ## If true, alertmanager will create/use a Persistent Volume Claim
      ## If false, use emptyDir
      ##
      enabled: false

      ## alertmanager data Persistent Volume size
      ##
      size: 1Gi

      ## alertmanager data Persistent Volume Storage Class
      ## If defined, storageClassName: <storageClass>
      ## If set to "-", storageClassName: "", which disables dynamic provisioning
      ## If undefined (the default) or set to null, no storageClassName spec is
      ## set, choosing the default provisioner. (gp2 on AWS, standard on
      ## GKE, AWS & OpenStack)
      ##
      # storageClass: "-"

    ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
    ##
    replicaCount: 1

    statefulSet:
      ## If true, use a StatefulSet instead of a Deployment for pod management.
      ## This allows scaling to more than 1 replica.
      ##
      enabled: true

      ## Alertmanager headless service to use for the statefulset
      ##
      headless:
        ## Enable peer mesh service endpoints to enable the HA Alertmanager
        ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
        enableMeshPeer: true

    ## alertmanager resource requests and limits
    ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
    ##
    resources: {}
      # limits:
      #   cpu: 10m
      #   memory: 32Mi
      # requests:
      #   cpu: 10m
      #   memory: 32Mi

    ## Security context to be added to alertmanager pods
    ##
    securityContext:
      runAsUser: 0
      runAsNonRoot: false
      runAsGroup: 65534
      fsGroup: 65534

    containerSecurityContext:
      allowPrivilegeEscalation: false

    ingress:
      ## If true, alertmanager Ingress will be created
      ##
      enabled: false

      # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
      # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
      # ingressClassName: nginx

      ## alertmanager Ingress annotations
      ##
      annotations: {}
        # kubernetes.io/ingress.class: nginx
        # kubernetes.io/tls-acme: 'true'

      ## alertmanager Ingress additional labels
      ##
      extraLabels: {}

      ## alertmanager Ingress hostnames with optional path
      ## Must be provided if Ingress is enabled
      ##
      hosts: []
        # - alertmanager.domain.com
        # - domain.com/alertmanager

      path: /

      # pathType is only for k8s >= 1.18
      pathType: Prefix

      ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
      extraPaths: []
        # - path: /*
        #   backend:
        #     serviceName: ssl-redirect
        #     servicePort: use-annotation

      ## alertmanager Ingress TLS configuration
      ## Secrets must be manually created in the namespace
      ##
      tls: []
        # - secretName: prometheus-alerts-tls
        #   hosts:
        #     - alertmanager.domain.com

    service:
      annotations: {}
      labels: {}
      clusterIP: ""

      ## Enable peer mesh service endpoints to enable the HA Alertmanager
      ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
      # enableMeshPeer: true

      ## List of IP addresses at which the alertmanager service is available
      ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
      ##
      externalIPs: []

      loadBalancerIP: ""
      loadBalancerSourceRanges: []
      servicePort: 80
      # nodePort: 30000
      sessionAffinity: None
      type: ClusterIP


  kubeStateMetrics:
    ## If false, the kube-state-metrics sub-chart will not be installed
    ##
    enabled: false

  nodeExporter:
    ## If false, node-exporter will not be installed
    ##
    enabled: false

    ## node-exporter container image
    ##
    image:
      repository: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/node-exporter
      tag: v1.3.1

  configmapReload:
    prometheus:
      ## configmap-reload container image
      ##
      image:
        repository: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/configmap-reload
        tag: v0.5.0
    alertmanager:
      ## configmap-reload container image
      ##
      image:
        repository: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/configmap-reload
        tag: v0.5.0

  server:
    ## If false, the Prometheus server will not be installed
    ##
    enabled: true

    ## Prometheus server container image
    ##
    image:
      repository: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/prometheus
      tag: v2.44.0

    global:
      ## How frequently to scrape targets by default
      ##
      scrape_interval: 15s
      ## How long until a scrape request times out
      ##
      scrape_timeout: 10s
      ## How frequently to evaluate rules
      ##
      evaluation_interval: 15s

    ## Additional Prometheus server container flags
    ##
    extraFlags:
      - web.enable-lifecycle
      - web.enable-remote-write-receiver

    ## Additional Prometheus server container arguments
    ##
    extraArgs:
      log.level: info
      storage.tsdb.min-block-duration: 30m
      enable-feature: memory-snapshot-on-shutdown
      storage.tsdb.retention.size: 10GB

    ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write
    ##
    remoteWrite: []
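    ## A minimal sketch of forwarding samples to an external TSDB over
    ## remote_write; the URL and credentials are placeholders:
    ##
    ## remoteWrite:
    ##   - url: https://metrics.example.com/api/v1/write
    ##     basic_auth:
    ##       username: prometheus
    ##       password: <PASSWORD>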
    ## Prefix used to register routes, overriding externalUrl route.
    ## Useful for proxies that rewrite URLs.
    ##
    routePrefix: /

    ## Node tolerations for server scheduling to nodes with taints
    ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
    ##
    tolerations:
      - key: kb-controller
        operator: Equal
        value: "true"
        effect: NoSchedule

    affinity:
      nodeAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            preference:
              matchExpressions:
                - key: kb-controller
                  operator: In
                  values:
                    - "true"

    persistentVolume:
      ## If true, Prometheus server will create/use a Persistent Volume Claim
      ## If false, use emptyDir
      ##
      enabled: false

      ## Prometheus server data Persistent Volume size
      ##
      size: 20Gi

      ## Prometheus server data Persistent Volume Storage Class
      ## If defined, storageClassName: <storageClass>
      ## If set to "-", storageClassName: "", which disables dynamic provisioning
      ## If undefined (the default) or set to null, no storageClassName spec is
      ## set, choosing the default provisioner. (gp2 on AWS, standard on
      ## GKE, AWS & OpenStack)
      ##
      # storageClass: "-"

    ## Use a StatefulSet if replicaCount needs to be greater than 1 (see below)
    ##
    replicaCount: 1

    statefulSet:
      ## If true, use a StatefulSet instead of a Deployment for pod management.
      ## This allows scaling to more than 1 replica.
      ##
      enabled: true

    ## Prometheus server resource requests and limits
    ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
    ##
    resources: {}
      # limits:
      #   cpu: 500m
      #   memory: 512Mi
      # requests:
      #   cpu: 500m
      #   memory: 512Mi

    ## Prometheus data retention period (default if not specified is 15 days)
    ##
    retention: "2d"

    ## Security context to be added to server pods
    ##
    securityContext:
      runAsUser: 0
      runAsNonRoot: false
      runAsGroup: 65534
      fsGroup: 65534

    containerSecurityContext:
      allowPrivilegeEscalation: false

    service:
      ## If false, no Service will be created for the Prometheus server
      ##
      enabled: true

      annotations: {}
      labels: {}
      clusterIP: ""

      ## List of IP addresses at which the Prometheus server service is available
      ## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
      ##
      externalIPs: []

      loadBalancerIP: ""
      loadBalancerSourceRanges: []
      servicePort: 80
      sessionAffinity: None
      type: ClusterIP

      ## Enable gRPC port on service to allow auto discovery with thanos-querier
      gRPC:
        enabled: false
        servicePort: 10901
        # nodePort: 10901

      ## If using a StatefulSet (statefulSet.enabled=true), configure the
      ## service to connect to a specific replica to have a consistent view
      ## of the data.
      statefulsetReplica:
        enabled: false
        replica: 0

    ingress:
      ## If true, Prometheus server Ingress will be created
      ##
      enabled: false

      # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
      # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
      # ingressClassName: nginx

      ## Prometheus server Ingress annotations
      ##
      annotations: {}
        # kubernetes.io/ingress.class: nginx
        # kubernetes.io/tls-acme: 'true'

      ## Prometheus server Ingress additional labels
      ##
      extraLabels: {}

      ## Prometheus server Ingress hostnames with optional path
      ## Must be provided if Ingress is enabled
      ##
      hosts: []
        # - prometheus.domain.com
        # - domain.com/prometheus

      path: /

      # pathType is only for k8s >= 1.18
      pathType: Prefix

      ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
      extraPaths: []
        # - path: /*
        #   backend:
        #     serviceName: ssl-redirect
        #     servicePort: use-annotation

      ## Prometheus server Ingress TLS configuration
      ## Secrets must be manually created in the namespace
      ##
      tls: []
        # - secretName: prometheus-server-tls
        #   hosts:
        #     - prometheus.domain.com




  ## AlertManager ConfigMap Entries
  ## NOTE: Please review these carefully, as the thresholds and behavior may not meet
  ## your SLOs or labels.
  ##
  alertmanagerFiles:
    alertmanager.yml:
      global: {}

      receivers:
        - name: default-receiver

      route:
        receiver: default-receiver
        group_wait: 5s
        group_interval: 30s
        repeat_interval: 10m
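  ## A hedged sketch of wiring the default receiver to a webhook; the URL is a
  ## placeholder, and a real setup would point at your notification gateway,
  ## e.g. the alertmanager-webhook-adaptor service configured later in this file:
  ##
  ## alertmanagerFiles:
  ##   alertmanager.yml:
  ##     receivers:
  ##       - name: default-receiver
  ##         webhook_configs:
  ##           - url: http://my-webhook-adaptor:5001/api/v1/notify
  ##     route:
  ##       receiver: default-receiver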
  ## Sample prometheus rules/alerts
  ## NOTE: Please review these carefully, as the thresholds and behavior may not meet
  ## your SLOs or labels.
  ##
  ruleFiles:
    kubelet_alert_rules.yml: |
      groups:
        - name: KubeletSummary
          rules:
            - alert: ContainerCpuUsageWarning
              expr: 'rate(container_cpu_time_seconds_total[2m]) / container_cpu_limit * 100 > 70'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'Container CPU usage is high (> 70%)'
                description: 'Container CPU usage is {{ $value | printf "%.2f" }} percent. (pod: {{ $labels.k8s_pod_name }}, container: {{ $labels.k8s_container_name }})'

            - alert: ContainerCpuUsageCritical
              expr: 'rate(container_cpu_time_seconds_total[2m]) / container_cpu_limit * 100 > 90'
              for: 1m
              labels:
                severity: critical
              annotations:
                summary: 'Container CPU usage is very high (> 90%)'
                description: 'Container CPU usage is {{ $value | printf "%.2f" }} percent. (pod: {{ $labels.k8s_pod_name }}, container: {{ $labels.k8s_container_name }})'

            - alert: ContainerMemoryUsage
              expr: 'container_memory_working_set_bytes / container_memory_limit_bytes * 100 > 90'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'Container Memory usage is high (> 90%)'
                description: 'Container Memory usage is {{ $value | printf "%.2f" }} percent. (pod: {{ $labels.k8s_pod_name }}, container: {{ $labels.k8s_container_name }})'

            - alert: ContainerMemoryUsagePredict
              expr: 'predict_linear(container_memory_working_set_bytes[15m], 30*60) - container_memory_limit_bytes > 0'
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: 'Container Memory usage may exceed the limit 30 minutes from now'
                description: 'Container Memory usage may exceed the limit 30 minutes from now; the predicted value is {{ $value | humanize1024 }}. (pod: {{ $labels.k8s_pod_name }}, container: {{ $labels.k8s_container_name }})'

            - alert: ContainerVolumeUsage
              expr: '(k8s_volume_capacity_bytes - k8s_volume_available_bytes) / k8s_volume_capacity_bytes * 100 > 90'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'Volume usage is high (> 90%)'
                description: 'Volume usage is {{ $value | printf "%.2f" }} percent. (pod: {{ $labels.k8s_pod_name }}, volume: {{ $labels.k8s_volume_name }})'

    mysql_alert_rules.yml: |
      groups:
        - name: MysqldExporter
          rules:
            - alert: MysqlDown
              expr: 'max_over_time(mysql_up[1m]) == 0'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'MySQL is down'
                description: 'MySQL is down. (instance: {{ $labels.pod }})'

            - alert: MysqlRestarted
              expr: 'mysql_global_status_uptime < 60'
              for: 0m
              labels:
                severity: info
              annotations:
                summary: 'MySQL has just been restarted (< 60s)'
                description: 'MySQL was restarted {{ $value | printf "%.1f" }} seconds ago. (instance: {{ $labels.pod }})'

            - alert: MysqlTooManyConnections
              expr: 'sum(max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 80'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'MySQL has too many connections (> 80%)'
                description: '{{ $value | printf "%.2f" }} percent of MySQL connections are in use. (instance: {{ $labels.pod }})'

            - alert: MysqlConnectionErrors
              expr: 'sum(increase(mysql_global_status_connection_errors_total[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'MySQL connection errors'
                description: 'MySQL has connection errors and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})'

            - alert: MysqlHighThreadsRunning
              expr: 'sum(max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 60'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'MySQL high threads running (> 60%)'
                description: '{{ $value | printf "%.2f" }} percent of MySQL connections are in running state. (instance: {{ $labels.pod }})'

            - alert: MysqlSlowQueries
              expr: 'sum(increase(mysql_global_status_slow_queries[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0'
              for: 2m
              labels:
                severity: info
              annotations:
                summary: 'MySQL slow queries'
                description: 'MySQL server has {{ $value | printf "%.2f" }} slow queries. (instance: {{ $labels.pod }})'
            - alert: MysqlInnodbLogWaits
              expr: 'sum(rate(mysql_global_status_innodb_log_waits[5m])) BY (namespace,app_kubernetes_io_instance,pod) > 10'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'MySQL InnoDB log waits (> 10)'
                description: 'MySQL InnoDB log writes are stalling and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})'

            - alert: MysqlInnodbBufferPoolHits
              expr: 'sum(rate(mysql_global_status_innodb_buffer_pool_reads[5m]) / rate(mysql_global_status_innodb_buffer_pool_read_requests[5m])) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 5'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'MySQL InnoDB high read requests rate hitting disk (> 5%)'
                description: 'High number of logical reads that InnoDB could not satisfy from the buffer pool and had to read directly from disk. The value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})'

    postgresql_alert_rules.yml: |
      groups:
        - name: PostgreSQLExporter
          rules:
            - alert: PostgreSQLDown
              expr: 'max_over_time(pg_up[1m]) == 0'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'PostgreSQL is down'
                description: 'PostgreSQL is down. (instance: {{ $labels.pod }})'

            - alert: PostgreSQLRestarted
              expr: 'time() - pg_postmaster_start_time_seconds < 60'
              for: 0m
              labels:
                severity: info
              annotations:
                summary: 'PostgreSQL has just been restarted (< 60s)'
                description: 'PostgreSQL was restarted {{ $value | printf "%.1f" }} seconds ago. (instance: {{ $labels.pod }})'

            - alert: PostgreSQLExporterError
              expr: 'pg_exporter_last_scrape_error > 0'
              for: 0m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL exporter scrape error'
                description: 'PostgreSQL exporter has {{ $value | printf "%.2f" }} scrape errors. A query may be buggy in query.yaml. (instance: {{ $labels.pod }})'

            - alert: PostgreSQLTooManySlowQueries
              expr: |
                max by(namespace,app_kubernetes_io_instance,pod,datname) (
                  max_over_time(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])
                ) > 60
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL database has a high number of slow queries'
                description: 'PostgreSQL database has slow queries and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})'

            - alert: PostgreSQLTooManyConnections
              expr: |
                sum by (namespace,app_kubernetes_io_instance,pod) (pg_stat_activity_count{datname!~"template.*"})
                > on(namespace,app_kubernetes_io_instance,pod)
                (pg_settings_max_connections - pg_settings_superuser_reserved_connections) * 0.8
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL too many connections (> 80%)'
                description: 'PostgreSQL has too many connections and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})'

            - alert: PostgreSQLDeadLocks
              expr: 'increase(pg_stat_database_deadlocks_total{datname!~"template.*", datname!=""}[2m]) > 5'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL database has deadlocks (> 5)'
                description: 'PostgreSQL database has {{ $value | printf "%.2f" }} deadlocks. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})'
            - alert: PostgreSQLHighRollbackRate
              expr: |
                rate(pg_stat_database_xact_rollback_total{datname!~"template.*", datname!=""}[2m])
                /
                rate(pg_stat_database_xact_commit_total{datname!~"template.*", datname!=""}[2m])
                > 0.1
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL database has a high rollback rate (> 10%)'
                description: 'The ratio of transactions being aborted compared to committed is {{ $value | printf "%.2f"}} percent. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})'

            - alert: PostgreSQLTooManyLocksAcquired
              expr: |
                sum by (namespace,app_kubernetes_io_instance,pod) (pg_locks_count)
                / on(namespace,app_kubernetes_io_instance,pod)
                (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
                > 0.2
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL has too many locks acquired (> 20%)'
                description: 'Too many locks acquired on the database and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})'

            - alert: PostgreSQLCacheHitRatio
              expr: |
                avg by (namespace,app_kubernetes_io_instance,pod,datname) (
                  rate(pg_stat_database_blks_hit_total{datname!~"template.*", datname!=""}[2m])
                  /
                  (
                    rate(
                      pg_stat_database_blks_hit_total{datname!~"template.*", datname!=""}[2m]
                    )
                    +
                    rate(
                      pg_stat_database_blks_read_total{datname!~"template.*", datname!=""}[2m]
                    )
                  )
                ) < 0.9
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL database has a low cache hit rate (< 90%)'
                description: 'Low cache hit rate and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})'

            - alert: PostgreSQLMaxWriteBufferReached
              expr: 'rate(pg_stat_bgwriter_maxwritten_clean_total[2m]) > 0'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL write buffers reached max'
                description: 'The PostgreSQL background writer stopped cleaning because it hit the max-written limit; the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})'

            - alert: PostgreSQLHighWALFilesArchiveErrorRate
              expr: |
                rate(pg_stat_archiver_failed_count_total[2m])
                / (
                  rate(pg_stat_archiver_archived_count_total[2m]) + rate(pg_stat_archiver_failed_count_total[2m])
                ) > 0.1
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL has a high error rate in the WAL files archiver (> 10%)'
                description: 'PostgreSQL has a high error rate in the WAL files archiver and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})'

            - alert: PostgreSQLTableNotAutoVacuumed
              expr: |
                (pg_stat_user_tables_last_autovacuum > 0)
                and
                (time() - pg_stat_user_tables_last_autovacuum)
                > 24 * 60 * 60 * 10
              for: 0m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL table in database has not been auto-vacuumed for 10 days'
                description: 'Table {{ $labels.relname }} in database has not been auto-vacuumed for 10 days. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})'
            - alert: PostgreSQLTableNotAutoAnalyzed
              expr: |
                (pg_stat_user_tables_last_autoanalyze > 0)
                and
                (time() - pg_stat_user_tables_last_autoanalyze)
                > 24 * 60 * 60 * 10
              for: 0m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL table in database has not been auto-analyzed for 10 days'
                description: 'Table {{ $labels.relname }} in database has not been auto-analyzed for 10 days. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})'

            - alert: PostgreSQLTableTooManyDeadTuples
              expr: |
                (pg_stat_user_tables_n_dead_tup > 10000)
                /
                (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)
                >= 0.1
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'PostgreSQL table in database has too many dead tuples (> 10%)'
                description: 'The dead-tuple count of table {{ $labels.relname }} in database is too large and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})'

    redis_alert_rules.yml: |
      groups:
        - name: RedisExporter
          rules:
            - alert: RedisDown
              expr: 'redis_up == 0'
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: 'Redis is down'
                description: 'Redis is down. (instance: {{ $labels.pod }})'

            - alert: RedisCPUHigh
              expr: '(rate(redis_cpu_sys_seconds_total[1m]) + rate(redis_cpu_user_seconds_total[1m])) * 100 > 80'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'Out of CPU (> 80%)'
                description: 'Redis is running out of CPU and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})'

            - alert: RedisMemoryHigh
              expr: '(redis_memory_max_bytes == 0 or redis_memory_used_bytes * 100 / redis_memory_max_bytes) > 90'
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: 'Out of memory (> 90%)'
                description: 'Redis is running out of memory and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})'

            - alert: RedisTooManyConnections
              expr: 'redis_connected_clients * 100 / redis_config_maxclients > 80'
              for: 1m
              labels:
                severity: warning
              annotations:
                summary: 'Redis has too many connections (> 80%)'
                description: 'Redis has too many connections and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})'

            - alert: RedisRejectedConnections
              expr: 'increase(redis_rejected_connections_total[1m]) > 0'
              for: 5m
              labels:
                severity: error
              annotations:
                summary: 'Redis has rejected connections'
                description: '{{ $value | printf "%.2f" }} connections to Redis have been rejected. (instance: {{ $labels.pod }})'

            - alert: RedisKeyEviction
              expr: 'increase(redis_evicted_keys_total[5m]) > 0'
              for: 1s
              labels:
                severity: error
              annotations:
                summary: 'Redis has evicted keys'
                description: 'Redis has evicted keys in the last 5 minutes and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})'

            - alert: RedisMissingMaster
              expr: 'count by (app_kubernetes_io_instance) (redis_instance_info{role="master"}) < 1'
              for: 30s
              labels:
                severity: critical
              annotations:
                summary: 'Redis missing master'
                description: 'Redis cluster has no node marked as master.'
            - alert: RedisDisconnectedSlaves
              expr: 'count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'Redis disconnected slaves'
                description: 'Redis is not replicating to all slaves. Consider reviewing the Redis replication status. (instance: {{ $labels.pod }})'

            - alert: RedisReplicationBroken
              expr: 'delta(redis_connected_slaves[1m]) < 0'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'Redis replication broken'
                description: 'Redis instance lost a slave. (instance: {{ $labels.pod }})'

    mongodb_alert_rules.yml: |-
      groups:
        - name: MongodbExporter
          rules:
            - alert: MongodbDown
              expr: 'max_over_time(mongodb_up[1m]) == 0'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'MongoDB is down'
                description: 'MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

            - alert: MongodbRestarted
              expr: 'mongodb_instance_uptime_seconds < 60'
              for: 0m
              labels:
                severity: info
              annotations:
                summary: 'Mongodb has just been restarted (< 60s)'
                description: 'Mongodb was restarted {{ $value | printf "%.1f" }} seconds ago\n LABELS = {{ $labels }}'

            - alert: MongodbReplicaMemberUnhealthy
              expr: 'max_over_time(mongodb_rs_members_health[1m]) == 0'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'Mongodb replica member is unhealthy'
                description: 'MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

            - alert: MongodbReplicationLag
              expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (pod) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'MongoDB replication lag (> 10s)'
                description: 'Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

            - alert: MongodbReplicationHeadroom
              expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (pod) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'MongoDB replication headroom (< 0)'
                description: 'MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

            - alert: MongodbNumberCursorsOpen
              expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'MongoDB opened cursors num (> 10k)'
                description: 'Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

            - alert: MongodbCursorsTimeouts
              expr: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'MongoDB cursors timeouts (> 100/minute)'
                description: 'Too many cursors are timing out (> 100/minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

            - alert: MongodbTooManyConnections
              expr: 'avg by(pod) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(pod) (sum (mongodb_ss_connections) by(pod)) * 100 > 80'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: 'MongoDB too many connections (> 80%)'
                description: 'Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

            - alert: MongodbVirtualMemoryUsage
              expr: '(sum(mongodb_ss_mem_virtual) BY (pod) / sum(mongodb_ss_mem_resident) BY (pod)) > 100'
              for: 2m
              labels:
                severity: warning
              annotations:
                summary: MongoDB virtual memory usage high
                description: "High memory usage: the quotient of (mem_virtual / mem_resident) is more than 100\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    kafka_alert_rules.yml: |-
      groups:
        - name: KafkaExporter
          rules:
            - alert: KafkaTopicsReplicas
              expr: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'Kafka topics replicas (instance {{ $labels.app_kubernetes_io_instance }})'
                description: 'Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'
            - alert: KafkaConsumersGroup
              expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50'
              for: 1m
              labels:
                severity: critical
              annotations:
                summary: 'Kafka consumers group (instance {{ $labels.app_kubernetes_io_instance }})'
                description: 'Kafka consumers group\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'
            - alert: KafkaBrokerDown
              expr: 'kafka_brokers < 3'
              for: 0m
              labels:
                severity: critical
              annotations:
                summary: 'Kafka broker *{{ $labels.app_kubernetes_io_instance }}* alert status'
                description: 'One of the Kafka brokers in *{{ $labels.app_kubernetes_io_instance }}* is down.'
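    ## Additional rule files can be added as further entries here; a minimal
    ## hedged sketch (the file name is illustrative, and the file must also be
    ## listed under serverFiles."prometheus.yml".rule_files to be loaded):
    ##
    ## my_alert_rules.yml: |
    ##   groups:
    ##     - name: MyRules
    ##       rules:
    ##         - alert: AlwaysFiring
    ##           expr: 'vector(1)'
    ##           labels:
    ##             severity: info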
  serverFiles:
    prometheus.yml:
      rule_files:
        - /etc/config/recording_rules.yml
        - /etc/config/alerting_rules.yml
        - /etc/config/kubelet_alert_rules.yml
        - /etc/config/mysql_alert_rules.yml
        - /etc/config/postgresql_alert_rules.yml
        - /etc/config/redis_alert_rules.yml
        - /etc/config/kafka_alert_rules.yml
        - /etc/config/mongodb_alert_rules.yml

      scrape_configs:
        - job_name: prometheus
          static_configs:
            - targets:
                - localhost:9090

        # Scrape config for kubeblocks managed service endpoints.
        #
        # The relabeling allows the actual service scrape endpoint to be configured
        # via the following annotations:
        #
        # * `monitor.kubeblocks.io/scrape`: Only scrape services that have a value of
        #   `true`.
        # * `monitor.kubeblocks.io/scheme`: If the metrics endpoint is secured, you will need
        #   to set this to `https` and most likely set the `tls_config` of the scrape config.
        # * `monitor.kubeblocks.io/path`: If the metrics path is not `/metrics`, override it here.
        # * `monitor.kubeblocks.io/port`: If the metrics are exposed on a different port from the
        #   service, set this appropriately.
        # * `monitor.kubeblocks.io/param_<parameter>`: If the metrics endpoint uses parameters,
        #   you can set any parameter this way.
        - job_name: 'kubeblocks-service'
          honor_labels: true

          kubernetes_sd_configs:
            - role: endpoints

          relabel_configs:
            - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_managed_by]
              action: keep
              regex: kubeblocks
            - source_labels: [__meta_kubernetes_service_label_monitor_kubeblocks_io_managed_by]
              action: drop
              regex: agamotto
            - source_labels: [__meta_kubernetes_service_annotation_monitor_kubeblocks_io_scrape]
              action: keep
              regex: true
            - source_labels: [__meta_kubernetes_service_annotation_monitor_kubeblocks_io_scheme]
              action: replace
              target_label: __scheme__
              regex: (https?)
            - source_labels: [__meta_kubernetes_service_annotation_monitor_kubeblocks_io_path]
              action: replace
              target_label: __metrics_path__
              regex: (.+)
            - source_labels: [__address__, __meta_kubernetes_service_annotation_monitor_kubeblocks_io_port]
              action: replace
              target_label: __address__
              regex: (.+?)(?::\d+)?;(\d+)
              replacement: $1:$2
            - action: labelmap
              regex: __meta_kubernetes_service_annotation_monitor_kubeblocks_io_param_(.+)
              replacement: __param_$1
            - action: labelmap
              regex: __meta_kubernetes_service_label_(.+)
            - source_labels: [__meta_kubernetes_namespace]
              action: replace
              target_label: namespace
            - source_labels: [__meta_kubernetes_service_name]
              action: replace
              target_label: service
            - source_labels: [__meta_kubernetes_pod_node_name]
              action: replace
              target_label: node
            - source_labels: [__meta_kubernetes_pod_name]
              action: replace
              target_label: pod
            - source_labels: [__meta_kubernetes_pod_phase]
              regex: Pending|Succeeded|Failed|Completed
              action: drop
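        # A hedged sketch of a Service that the relabeling above would pick up;
        # every name, label, and port below is illustrative:
        #
        #   apiVersion: v1
        #   kind: Service
        #   metadata:
        #     name: mycluster-mysql-metrics
        #     labels:
        #       app.kubernetes.io/managed-by: kubeblocks
        #     annotations:
        #       monitor.kubeblocks.io/scrape: "true"
        #       monitor.kubeblocks.io/scheme: http
        #       monitor.kubeblocks.io/path: /metrics
        #       monitor.kubeblocks.io/port: "9104"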
        - job_name: 'kubeblocks-agamotto'
          honor_labels: true

          kubernetes_sd_configs:
            - role: endpoints

          relabel_configs:
            - source_labels: [__meta_kubernetes_service_label_monitor_kubeblocks_io_managed_by]
              action: keep
              regex: agamotto
            - source_labels: [__meta_kubernetes_service_annotation_monitor_kubeblocks_io_scrape]
              action: keep
              regex: true
            - source_labels: [__meta_kubernetes_service_annotation_monitor_kubeblocks_io_scheme]
              action: replace
              target_label: __scheme__
              regex: (https?)
            - source_labels: [__meta_kubernetes_service_annotation_monitor_kubeblocks_io_path]
              action: replace
              target_label: __metrics_path__
              regex: (.+)
            - source_labels: [__address__, __meta_kubernetes_service_annotation_monitor_kubeblocks_io_port]
              action: replace
              target_label: __address__
              regex: (.+?)(?::\d+)?;(\d+)
              replacement: $1:$2
            - action: labelmap
              regex: __meta_kubernetes_service_annotation_monitor_kubeblocks_io_param_(.+)
              replacement: __param_$1
            - source_labels: [__meta_kubernetes_pod_phase]
              regex: Pending|Succeeded|Failed|Completed
              action: drop

  pushgateway:
    ## If false, pushgateway will not be installed
    ##
    enabled: false

## loki settings for kubeblocks
loki:
  enabled: false
  singleBinary:
    replicas: 1
  monitoring:
    lokiCanary:
      enabled: false
    selfMonitoring:
      enabled: false
      grafanaAgent:
        installOperator: false
    dashboards:
      enabled: false
    rules:
      enabled: false
    serviceMonitor:
      enabled: false
  test:
    enabled: false
  loki:
    auth_enabled: false
    commonConfig:
      replication_factor: 1
    storage:
      type: filesystem
    podSecurityContext:
      runAsNonRoot: false
      runAsUser: 0
    limits_config:
      max_query_lookback: 72h
      retention_period: 72h
    compactor:
      working_directory: /var/loki/retention
      shared_store: filesystem
      compaction_interval: 10m
      retention_enabled: true
      retention_delete_delay: 2h
      retention_delete_worker_count: 150
      delete_request_cancel_period: 2h


grafana:
  ## If false, the grafana sub-chart will not be installed
  ##
  enabled: false

  rbac:
    pspEnabled: false

  replicas: 1

  image:
    repository: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/grafana
    # Overrides the Grafana image tag whose default is the chart appVersion
    tag: 9.2.4

  ## Grafana server resource requests and limits
  ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/
  ##
  resources: {}
    # limits:
    #   cpu: 100m
    #   memory: 128Mi
    # requests:
    #   cpu: 100m
    #   memory: 128Mi

  ## Node tolerations for grafana scheduling to nodes with taints
  ## Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/
  ##
  tolerations:
    - key: kb-controller
      operator: Equal
      value: "true"
      effect: NoSchedule

  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          preference:
            matchExpressions:
              - key: kb-controller
                operator: In
                values:
                  - "true"

  ## Timezone for the default dashboards
  ## Other options are: browser, or a specific timezone, e.g. Europe/Luxembourg
  ##
  defaultDashboardsTimezone:

  adminUser: admin
  adminPassword: kubeblocks

  sidecar:
    image:
      repository: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/k8s-sidecar
      tag: 1.19.2

    dashboards:
      enabled: true
      label: grafana_dashboard
      labelValue: "1"
      searchNamespace: ALL
      resource: configmap

    datasources:
      enabled: true
      label: grafana_datasource
      labelValue: "1"
      searchNamespace: ALL
      resource: configmap

      defaultDatasourceEnabled: true
      uid: prometheus

      skipReload: false
      initDatasources: true

  testFramework:
    enabled: false

  grafana.ini:
    # Basic auth is enabled by default and works with the built-in Grafana user/password authentication system and LDAP authentication integration.
    auth.basic:
      enabled: false

    auth.anonymous:
      enabled: true
      # Hide the Grafana version text from the footer and help tooltip for unauthenticated users (default: false)
      hide_version: true

  ingress:
    enabled: false
    # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
    # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
    # ingressClassName: nginx
    # Values can be templated
    annotations: {}
      # kubernetes.io/ingress.class: nginx
      # kubernetes.io/tls-acme: "true"
    labels: {}
    path: /

    # pathType is only for k8s >= 1.18
    pathType: Prefix

    hosts:
      - chart-example.local
    ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
    extraPaths: []
      # - path: /*
      #   backend:
      #     serviceName: ssl-redirect
      #     servicePort: use-annotation
      ## Or for k8s > 1.19
      # - path: /*
      #   pathType: Prefix
      #   backend:
      #     service:
      #       name: ssl-redirect
      #       port:
      #         name: use-annotation


    tls: []
      # - secretName: chart-example-tls
      #   hosts:
      #     - chart-example.local

  ## Expose the grafana service to be accessed from outside the cluster (LoadBalancer service),
  ## or access it from within the cluster (ClusterIP service). Set the service type and the port to serve it.
  ## ref: http://kubernetes.io/docs/user-guide/services/
  ##
  service:
    enabled: true
    type: ClusterIP
    port: 80
    targetPort: 3000
    # targetPort: 4181 To be used with a proxy extraContainer
    ## Service annotations. Can be templated.
    annotations: {}
    labels: {}
    portName: service
    # Adds the appProtocol field to the service. This allows working with Istio protocol selection, e.g. "http" or "tcp".
    appProtocol: ""


### snapshot-controller settings
### ref: https://artifacthub.io/packages/helm/piraeus-charts/snapshot-controller#configuration
###
snapshot-controller:
  ## @param snapshot-controller.enabled -- Enable the snapshot-controller chart.
  ##
  enabled: true
  ## @param snapshot-controller.replicaCount -- Number of replicas to deploy.
  ##
  replicaCount: 1
  ## snapshot-controller image settings, for easy access by CN users.
  ## @param snapshot-controller.image.repository -- Repository to pull the image from.
  ##
  image:
    repository: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/snapshot-controller
    tag: v6.2.1

  tolerations:
    - key: kb-controller
      operator: Equal
      value: "true"
      effect: NoSchedule

  volumeSnapshotClasses:
    - name: default-vsc
      driver: hostpath.csi.k8s.io
      deletionPolicy: Delete
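  ## The default class above uses the hostpath CSI driver; on a cloud CSI driver
  ## you would swap the driver name, e.g. (illustrative, for the AWS EBS CSI driver):
  ##
  ## volumeSnapshotClasses:
  ##   - name: default-vsc
  ##     driver: ebs.csi.aws.com
  ##     deletionPolicy: Delete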
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          preference:
            matchExpressions:
              - key: kb-controller
                operator: In
                values:
                  - "true"

kubeblocks-csi-driver:
  enabled: false


cloudProvider:
  ## cloudProvider secret settings
  ## @param cloudProvider.accessKey -- S3 Access Key.
  ## @param cloudProvider.secretKey -- S3 Secret Key.
  ## @param cloudProvider.region -- S3 region.
  ## @param cloudProvider.name -- cloud provider name, options: [aws, aliyun].
  ## @param cloudProvider.bucket -- S3 Bucket.
  accessKey: ""
  secretKey: ""
  region: ""
  name: ""
  bucket: ""

## csi-s3 settings
## ref: https://artifacthub.io/packages/helm/cloudve/csi-s3#configuration
##
csi-s3:
  ## @param csi-s3.enabled -- Enable the csi-s3 chart.
  ##
  enabled: false

alertmanager-webhook-adaptor:
  ## Linked with prometheus.enabled
  ##
  # enabled: false

  ## webhook-adaptor container image
  ##
  image:
    registry: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com

  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          preference:
            matchExpressions:
              - key: kb-controller
                operator: In
                values:
                  - "true"

  ## ConfigMap override where fullname is {{.Release.Name}}-{{.Values.configMapOverrideName}}
  ##
  configMapOverrideName: "config"

  ## webhook-adaptor ConfigMap entries
  configFiles:
    config.yaml: {}

csi-hostpath-driver:
  ## @param csi-hostpath-driver.enabled -- Enable the csi-hostpath-driver chart.
  ##
  enabled: false
  ## csi-hostpath-driver storageClass settings
  ## @param csi-hostpath-driver.storageClass.create -- Specifies whether the storage class should be created.
  ## @param csi-hostpath-driver.storageClass.default -- Specifies whether the storage class should be set as default after creation.
  ##
  storageClass:
    create: true
    default: true

aws-load-balancer-controller:
  clusterName: ""
  enabled: false
  replicaCount: 1
  tolerations:
    - key: kb-controller
      operator: Equal
      value: "true"
      effect: NoSchedule
  serviceAccount:
    create: true
    name: kubeblocks-service-account-aws-load-balancer-controller
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          preference:
            matchExpressions:
              - key: kb-controller
                operator: In
                values:
                  - "true"

## k8s cluster feature gates, ref: https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/
enabledAlphaFeatureGates:
  ## @param enabledAlphaFeatureGates.recoverVolumeExpansionFailure -- Specifies whether the RecoverVolumeExpansionFailure feature gate is enabled in the k8s cluster.
  ##
  recoverVolumeExpansionFailure: false


agamotto:
  enabled: false
  image:
    registry: infracreate-registry.cn-zhangjiakou.cr.aliyuncs.com


provider: ""  # can be "aws", "gcp", "aliyun", "tencentCloud", "huaweiCloud", "azure"
validProviders:
  - "aws"
  - "gcp"
  - "aliyun"
  - "tencentCloud"
  - "huaweiCloud"
  - "azure"
  - ""

## @section KubeBlocks default storageClass parameters for cloud providers.
storageClass:
  ## @param storageClass.name -- Specifies the name of the default storage class.
  ## If the name is not specified and KubeBlocks is deployed in a cloud, a default name will be generated.
  ##
  name: ""
  ## @param storageClass.create -- Specifies whether the storage class should be created. If storageClass.name is not
  ## specified or generated, this value will be ignored.
  ##
  create: true
  mountOptions:
    - noatime
    - nobarrier
  provider:
    aws:
      volumeType: gp3
      fsType: xfs
    gcp:
      volumeType: pd-balanced
      fsType: xfs
    aliyun:
      volumeType: cloud_essd
      fsType: xfs
    azure:
      volumeType: managed
      fsType: xfs
    tencentCloud:
      volumeType: CLOUD_SSD
    huaweiCloud:  # Huawei Cloud
      volumeType: SSD
      fsType: ext4

external-dns:
  enabled: false
  domain: kubeblocks.io
  tolerations:
    - key: kb-controller
      operator: Equal
      value: "true"
      effect: NoSchedule

developMode: false
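## A hedged end-to-end example of overriding common values at install time; the
## chart reference and namespace are illustrative:
##   helm install kubeblocks kubeblocks/kubeblocks \
##     --namespace kb-system --create-namespace \
##     --set image.registry=docker.io \
##     --set prometheus.enabled=true \
##     --set grafana.enabled=true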