github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/store_gateway.yml (about)

     1  {{- /*
     2  Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md
     3  */ -}}
     4  {{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.store_gateway ) .Values.storegateway.enabled ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
        # PrometheusRule holding the Thanos Store Gateway alerts.
        # The guard above renders this manifest only when all of the following hold:
        #   - metrics.enabled is set
        #   - metrics.prometheusRule.default.create or
        #     metrics.prometheusRule.default.store_gateway is set
        #   - storegateway.enabled is set
        #   - the target cluster reports the monitoring.coreos.com/v1 API version
     5  apiVersion: monitoring.coreos.com/v1
     6  kind: PrometheusRule
     7  metadata:
     8    name: {{ template "common.names.fullname" . }}-store-gateway
          # Falls back to the release namespace when
          # metrics.prometheusRule.namespace is not set.
     9    namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
          # Standard chart labels, followed by the optional
          # metrics.prometheusRule.additionalLabels and commonLabels values.
    10    labels: {{- include "common.labels.standard" . | nindent 4 }}
    11      {{- if .Values.metrics.prometheusRule.additionalLabels }}
    12      {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    13      {{- end }}
    14      {{- if .Values.commonLabels }}
    15      {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    16      {{- end }}
    17    {{- if .Values.commonAnnotations }}
    18    annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
    19    {{- end }}
    20  spec:
    21    groups:
          # Single rule group; each alert below can be switched off individually
          # through metrics.prometheusRule.default.disabled.<AlertName>.
    22    - name: thanos-store
    23      rules:
    24      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosStoreGrpcErrorRate | default false) }}
            # ThanosStoreGrpcErrorRate: warns when, per (job, verrazzano_cluster),
            # more than 5% of the gRPC requests started on thanos-store jobs were
            # handled with one of the listed error codes (5m rates), for 5m.
            # Omitted when
            # metrics.prometheusRule.default.disabled.ThanosStoreGrpcErrorRate
            # is set to a true value.
    25      - alert: ThanosStoreGrpcErrorRate
    26        annotations:
                # commonAnnotations, when set, are rendered into this alert's
                # annotations ahead of the fixed entries below.
    27          {{- if .Values.commonAnnotations }}
    28          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    29          {{- end }}
                # The backtick-quoted brace pairs in the description render as
                # literal double braces, so the $labels / $value placeholders
                # survive Helm rendering and reach the generated rule unchanged.
    30          description: Thanos Store {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests.
    31          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate
    32          summary: Thanos Store is failing to handle gRPC requests.
    33        expr: |
    34          (
    35            sum by (job, verrazzano_cluster) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))
    36          /
    37            sum by (job, verrazzano_cluster) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m]))
    38          * 100 > 5
    39          )
    40        for: 5m
    41        labels:
    42          severity: warning
                # metrics.prometheusRule.additionalLabels, when set, are added
                # to the labels of this alert as well.
    43          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    44          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    45          {{- end }}
    46      {{- end }}
    47      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosStoreSeriesGateLatencyHigh | default false) }}
            # ThanosStoreSeriesGateLatencyHigh: warns when, per
            # (job, verrazzano_cluster), the 0.99 quantile of the series gate
            # duration histogram on thanos-store jobs exceeds 2 (5m rate) for 10m.
            # The second clause requires a non-zero rate of the matching _count
            # series over the same window.
            # Omitted when
            # metrics.prometheusRule.default.disabled.ThanosStoreSeriesGateLatencyHigh
            # is set to a true value.
    48      - alert: ThanosStoreSeriesGateLatencyHigh
    49        annotations:
                # commonAnnotations, when set, are rendered into this alert's
                # annotations ahead of the fixed entries below.
    50          {{- if .Values.commonAnnotations }}
    51          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    52          {{- end }}
    53          description: Thanos Store {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for store series gate requests.
    54          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh
    55          summary: Thanos Store has high latency for store series gate requests.
    56        expr: |
    57          (
    58            histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2
    59          and
    60            sum by (job, verrazzano_cluster) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0
    61          )
    62        for: 10m
    63        labels:
    64          severity: warning
                # metrics.prometheusRule.additionalLabels, when set, are added
                # to the labels of this alert as well.
    65          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    66          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    67          {{- end }}
    68      {{- end }}
    69      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosStoreBucketHighOperationFailures | default false) }}
            # ThanosStoreBucketHighOperationFailures: warns when, per
            # (job, verrazzano_cluster), failed bucket operations exceed 5% of all
            # bucket operations on thanos-store jobs (5m rates) for 15m.
            # Omitted when
            # metrics.prometheusRule.default.disabled.ThanosStoreBucketHighOperationFailures
            # is set to a true value.
    70      - alert: ThanosStoreBucketHighOperationFailures
    71        annotations:
                # commonAnnotations, when set, are rendered into this alert's
                # annotations ahead of the fixed entries below.
    72          {{- if .Values.commonAnnotations }}
    73          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    74          {{- end }}
    75          description: Thanos Store {{`{{`}} $labels.job {{`}}`}} Bucket is failing to execute {{`{{`}} $value | humanize {{`}}`}}% of operations.
    76          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures
    77          summary: Thanos Store Bucket is failing to execute operations.
    78        expr: |
    79          (
    80            sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m]))
    81          /
    82            sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m]))
    83          * 100 > 5
    84          )
    85        for: 15m
    86        labels:
    87          severity: warning
                # metrics.prometheusRule.additionalLabels, when set, are added
                # to the labels of this alert as well.
    88          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    89          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    90          {{- end }}
    91      {{- end }}
    92      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosStoreObjstoreOperationLatencyHigh | default false) }}
            # ThanosStoreObjstoreOperationLatencyHigh: warns when, per
            # (job, verrazzano_cluster), the 0.99 quantile of the bucket operation
            # duration histogram on thanos-store jobs exceeds 2 (5m rate) for 10m.
            # The second clause requires a non-zero rate of the matching _count
            # series over the same window.
            # Omitted when
            # metrics.prometheusRule.default.disabled.ThanosStoreObjstoreOperationLatencyHigh
            # is set to a true value.
    93      - alert: ThanosStoreObjstoreOperationLatencyHigh
    94        annotations:
                # commonAnnotations, when set, are rendered into this alert's
                # annotations ahead of the fixed entries below.
    95          {{- if .Values.commonAnnotations }}
    96          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    97          {{- end }}
    98          description: Thanos Store {{`{{`}} $labels.job {{`}}`}} Bucket has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for the bucket operations.
    99          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh
   100          summary: Thanos Store is having high latency for bucket operations.
   101        expr: |
   102          (
   103            histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2
   104          and
   105            sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0
   106          )
   107        for: 10m
   108        labels:
   109          severity: warning
                # metrics.prometheusRule.additionalLabels, when set, are added
                # to the labels of this alert as well.
   110          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   111          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   112          {{- end }}
   113      {{- end }}
   114  {{- end }}