k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/testing/load/modules/measurements.yaml (about)

     1  ## The measurement module defines test-scoped measurements.
     2  
     3  ## Input params
     4  # Valid actions: "start", "gather"
        # The value is forwarded unchanged as the action param of every measurement
        # and sub-module in this file, and is also spliced into the step name.
     5  {{$action := .action}}
     6  
     7  ## Feature-gates and configs:
        # Every variable below is bound with DefaultParam: an override key followed
        # by a literal (presumably the fallback when the key is unset - confirm
        # against the DefaultParam helper).
        # NOTE(review): a few keys carry no CL2_ prefix (for example
        # CUSTOM_API_CALL_THRESHOLDS); they are kept as-is so existing overrides
        # continue to resolve.
     8  {{$ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_ALLOWED_SLOW_API_CALLS 0}}
     9  {{$API_AVAILABILITY_PERCENTAGE_THRESHOLD := DefaultParam .CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD 99.5}}
    10  {{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}}
    11  {{$CUSTOM_API_CALL_THRESHOLDS := DefaultParam .CUSTOM_API_CALL_THRESHOLDS ""}}
    12  {{$ENABLE_API_AVAILABILITY_MEASUREMENT := DefaultParam .CL2_ENABLE_API_AVAILABILITY_MEASUREMENT false}}
    13  {{$API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED false}}
    14  {{$API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS false}}
    15  {{$API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS false}}
    16  {{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}}
    17  {{$ENABLE_SLO_MEASUREMENT := DefaultParam .CL2_ENABLE_SLO_MEASUREMENT true}}
    18  {{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}}
    19  {{$ENABLE_NODE_LOCAL_DNS_LATENCY := DefaultParam .CL2_ENABLE_NODE_LOCAL_DNS_LATENCY false}}
    20  {{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}}
    21  {{$ENABLE_SYSTEM_POD_METRICS := DefaultParam .ENABLE_SYSTEM_POD_METRICS true}}
    22  {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS false}}
    23  {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}}
    24  {{$ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT := DefaultParam .CL2_ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT false}}
    25  {{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}}
    26  {{$CEP_PROPAGATION_DELAY_SLO_BUCKET := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_BUCKET 600}}
    27  {{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_PERCENTILE 95.0}}
    29  {{$ENABLE_CONTAINER_RESTARTS_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESTARTS_MEASUREMENT false}}
    30  {{$ENABLE_CONTAINER_RESOURCES_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESOURCES_MEASUREMENT false}}
    31  {{$ENABLE_TERMINATED_WATCHES_MEASUREMENT := DefaultParam .CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT false}}
    32  {{$ENABLE_QUOTAS_USAGE_MEASUREMENT := DefaultParam .CL2_ENABLE_QUOTAS_USAGE_MEASUREMENT false}}
    33  {{$ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_ALLOWED_CONTAINER_RESTARTS 1}}
    34  {{$CUSTOM_ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_CUSTOM_ALLOWED_CONTAINER_RESTARTS ""}}
    35  {{$NODE_LOCAL_DNS_LATENCY_THRESHOLD := DefaultParam .CL2_NODE_LOCAL_DNS_LATENCY_THRESHOLD "5s"}}
    36  {{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}}
    37  {{$PROMETHEUS_SCRAPE_KUBE_STATE_METRICS := DefaultParam .PROMETHEUS_SCRAPE_KUBE_STATE_METRICS false}}
    38  {{$PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS := DefaultParam .PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS false}}
    39  {{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION := DefaultParam .CL2_PROBE_MEASUREMENTS_PING_SLEEP_DURATION "1s"}}
    40  {{$RESTART_COUNT_THRESHOLD_OVERRIDES := DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}}
    41  {{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}}
    42  {{$ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES false}}
    43  {{$NETWORK_PROGRAMMING_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD "30s"}}
    44  {{$NETWORK_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_LATENCY_THRESHOLD "0s"}}
    45  
    46  # Probe measurements shared parameter
        # Passed as checkProbesReadyTimeout to both InClusterNetworkLatency and
        # SLOMeasurement below, so the two always receive the same value.
    47  {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT := DefaultParam .CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT "15m"}}
    48  
    49  steps:
        # The step name is the action with "ing" appended, so the documented
        # actions render as "starting measurements" / "gathering measurements".
    50  - name: "{{$action}}ing measurements"
    51    measurements:
    52    - Identifier: APIResponsivenessPrometheus
            # Always registered. The violation-related params are rendered only
            # while $USE_SIMPLE_LATENCY_QUERY is false; otherwise action is the
            # only param passed to this entry.
    53      Method: APIResponsivenessPrometheus
    54      Params:
    55        action: {{$action}}
    56  {{if not $USE_SIMPLE_LATENCY_QUERY}}
    57        enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS}}
    58        allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}}
    59        customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}}
    60  {{end}}
    61    - Identifier: APIResponsivenessPrometheusSimple
            # Second registration of the APIResponsivenessPrometheus method, always
            # with useSimpleLatencyQuery: true and its own summaryName. Violations
            # are driven by the *_PROMETHEUS_SIMPLE flag rather than the flag used
            # by the entry above.
    62      Method: APIResponsivenessPrometheus
    63      Params:
    64        action: {{$action}}
    65        enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}}
    66        useSimpleLatencyQuery: true
    67        summaryName: APIResponsivenessPrometheus_simple
    68        allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}}
    69        customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}}
    70    - Identifier: CreatePhasePodStartupLatency
            # Always registered; uses the PodStartupLatency method with the label
            # selector "group = load". The 1h threshold is a deliberate placeholder
            # (see the TODO on that line).
    71      Method: PodStartupLatency
    72      Params:
    73        action: {{$action}}
    74        labelSelector: group = load
    75        threshold: 1h # TODO(https://github.com/kubernetes/perf-tests/issues/1024): Ideally, this should be 5s
    76  {{if $ENABLE_IN_CLUSTER_NETWORK_LATENCY}}
          # Rendered only when $ENABLE_IN_CLUSTER_NETWORK_LATENCY is true.
          # replicasPerProbe is AddInt 2 (DivideInt .Nodes 100) - presumably
          # 2 + Nodes/100 with integer division; confirm against the helpers.
    77    - Identifier: InClusterNetworkLatency
    78      Method: InClusterNetworkLatency
    79      Params:
    80        action: {{$action}}
    81        checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}}
    82        replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
    83        pingSleepDuration: {{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION}}
    84        threshold: {{$NETWORK_LATENCY_THRESHOLD}}
    85  {{end}}
    86  {{if $ENABLE_NODE_LOCAL_DNS_LATENCY}}
          # Rendered only when $ENABLE_NODE_LOCAL_DNS_LATENCY is true. Violations
          # are hard-coded on; only the threshold is configurable.
    87    - Identifier: NodeLocalDNSLatency
    88      Method: NodeLocalDNSLatencyPrometheus
    89      Params:
    90        action: {{$action}}
    91        enableViolations: true
    92        threshold: {{$NODE_LOCAL_DNS_LATENCY_THRESHOLD}}
    93  {{end}}
    94  {{if $ENABLE_SLO_MEASUREMENT}}
          # Rendered only when $ENABLE_SLO_MEASUREMENT is true. Receives the same
          # checkProbesReadyTimeout and replicasPerProbe expressions as the
          # InClusterNetworkLatency entry.
    95    - Identifier: SLOMeasurement
    96      Method: SLOMeasurement
    97      Params:
    98        action: {{$action}}
    99        checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}}
   100        replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
   101  {{end}}
   102  {{if $PROMETHEUS_SCRAPE_KUBE_PROXY}}
          # Both entries in this section are rendered only when
          # $PROMETHEUS_SCRAPE_KUBE_PROXY is true.
   103    - Identifier: NetworkProgrammingLatency
   104      Method: NetworkProgrammingLatency
   105      Params:
   106        action: {{$action}}
   107        enableViolations: {{$ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES}}
   108        threshold: {{$NETWORK_PROGRAMMING_LATENCY_THRESHOLD}}
          # Generic query over the partial-restore failure counter: a threshold of
          # 0 is configured for the summed value, and requireSamples is off so a
          # missing metric is tolerated.
   109    - Identifier: Kube-proxy partial iptables restore failures
   110      Method: GenericPrometheusQuery
   111      Params:
   112        action: {{$action}}
   113        metricName: KubeProxyIptablesRestoreFailures
   114        metricVersion: v1alpha1
   115        unit: failures
   116        queries:
   117        - name: Total
   118          query: sum(kubeproxy_sync_proxy_rules_iptables_partial_restore_failures_total)
   119          requireSamples: false # It is a feature gate and may not be enabled
   120          threshold: 0
   121  {{end}}
   122  {{if $PROMETHEUS_SCRAPE_KUBE_STATE_METRICS}}
          # Rendered only when $PROMETHEUS_SCRAPE_KUBE_STATE_METRICS is true; takes
          # no params beyond the action.
   123    - Identifier: KubeStateMetricsLatency
   124      Method: KubeStateMetricsLatency
   125      Params:
   126        action: {{$action}}
   127  {{end}}
   128  {{if $PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS}}
          # Rendered only when $PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS is true;
          # takes no params beyond the action.
   129    - Identifier: MetricsServerPrometheus
   130      Method: MetricsServerPrometheus
   131      Params:
   132        action: {{$action}}
   133  {{end}}
   134  
   135  {{if $ENABLE_API_AVAILABILITY_MEASUREMENT}}
          # Rendered only when $ENABLE_API_AVAILABILITY_MEASUREMENT is true.
          # pollFrequency and hostPollTimeoutSeconds are fixed; only the threshold
          # and the host-IP switches are configurable.
   136    - Identifier: APIAvailability
   137      Method: APIAvailability
   138      Params:
   139        action: {{$action}}
   140        pollFrequency: "5s"
   141        hostPollTimeoutSeconds: 5
   142        threshold: {{$API_AVAILABILITY_PERCENTAGE_THRESHOLD}}
              # The two host-IP keys are emitted only when the IPS_CONFIGURED flag
              # is true; otherwise they are omitted from Params entirely. The guard
              # sits at the same column as every other control action in this file
              # so it does not render whitespace-only lines.
   143  {{if $API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED}}
   144        useHostInternalIPs: {{$API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS}}
   145        useHostPublicIPs: {{$API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS}}
   146  {{end}}
   147  {{end}}
   148  {{if $ENABLE_CONTAINER_RESTARTS_MEASUREMENT}}
          # Rendered only when $ENABLE_CONTAINER_RESTARTS_MEASUREMENT is true.
          # Violations are hard-coded on; the default and custom restart
          # allowances come from the CL2_ALLOWED_CONTAINER_RESTARTS and
          # CL2_CUSTOM_ALLOWED_CONTAINER_RESTARTS overrides.
   149    - Identifier: ContainerRestarts
   150      Method: ContainerRestarts
   151      Params:
   152        action: {{$action}}
   153        enableViolations: true
   154        defaultAllowedRestarts: {{$ALLOWED_CONTAINER_RESTARTS}}
   155        customAllowedRestarts: {{YamlQuote $CUSTOM_ALLOWED_CONTAINER_RESTARTS 4}}
   156  {{end}}
   157  {{if $ENABLE_CONTAINER_RESOURCES_MEASUREMENT}}
          # Both entries are rendered only when
          # $ENABLE_CONTAINER_RESOURCES_MEASUREMENT is true. Each reports the 99th,
          # 90th and 50th quantile over time, grouped by the container label.
          # NOTE(review): the %v placeholder in the subquery range is presumably
          # filled in by GenericPrometheusQuery with the measured window - confirm.
   158    - Identifier: ContainerCPU
   159      Method: GenericPrometheusQuery
   160      Params:
   161        action: {{$action}}
   162        metricName: Container CPU
   163        metricVersion: v1
   164        unit: cores
   165        dimensions:
   166        - container
   167        queries:
   168        - name: Perc99
   169          query: quantile_over_time(0.99, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
   170        - name: Perc90
   171          query: quantile_over_time(0.90, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
   172        - name: Perc50
   173          query: quantile_over_time(0.50, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
          # Working-set bytes are divided by 1024 twice inside the query, matching
          # the MiB unit declared below.
   174    - Identifier: ContainerMemory
   175      Method: GenericPrometheusQuery
   176      Params:
   177        action: {{$action}}
   178        metricName: Container Memory
   179        metricVersion: v1
   180        unit: MiB
   181        dimensions:
   182        - container
   183        queries:
   184        - name: Perc99
   185          query: quantile_over_time(0.99, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
   186        - name: Perc90
   187          query: quantile_over_time(0.90, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
   188        - name: Perc50
   189          query: quantile_over_time(0.50, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
   190  {{end}}
   191  {{if $ENABLE_TERMINATED_WATCHES_MEASUREMENT}}
          # Both entries are rendered only when
          # $ENABLE_TERMINATED_WATCHES_MEASUREMENT is true. Each sums the increase
          # of one apiserver counter, grouped by the resource label.
          # NOTE(review): these are the only GenericPrometheusQuery entries in this
          # file without a unit key - confirm that unit is optional for the method.
   192    - Identifier: TerminatedWatchesMetrics
   193      Method: GenericPrometheusQuery
   194      Params:
   195        action: {{$action}}
   196        metricName: Terminated Watches
   197        metricVersion: v1
   198        dimensions:
   199        - resource
   200        queries:
   201        - name: Terminated watches
   202          query: sum(increase(apiserver_terminated_watchers_total[%v:])) by (resource)
   203    - Identifier: WatchCacheInitializations
   204      Method: GenericPrometheusQuery
   205      Params:
   206        action: {{$action}}
   207        metricName: Watch Cache Initializations
   208        metricVersion: v1
   209        dimensions:
   210        - resource
   211        queries:
   212        - name: Watch cache reinitializations
   213          query: sum(increase(apiserver_watch_cache_initializations_total[%v:])) by (resource)
   214  {{end}}
   215  {{if $ENABLE_QUOTAS_USAGE_MEASUREMENT}}
          # Rendered only when $ENABLE_QUOTAS_USAGE_MEASUREMENT is true. The only
          # entry in this file that sets prometheusClient (to "managed"). Both
          # queries multiply a per-second irate by 60, which lines up with the
          # QPMs unit (presumably queries per minute - confirm).
   216    - Identifier: Quotas total usage
   217      Method: GenericPrometheusQuery
   218      Params:
   219        action: {{$action}}
   220        metricName: Quota usage
   221        metricVersion: v1
   222        prometheusClient: managed
   223        unit: QPMs
   224        dimensions:
   225        - quota_metric
   226        queries:
   227        - name: perc99
   228          query: quantile_over_time(0.99, sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60
   229        - name: max
   230          query: max_over_time(sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60
   231  {{end}}
   232  {{if $ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT}}
          # Rendered only when $ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT is true.
          # Violations are hard-coded on; the SLO bucket and percentile come from
          # the CL2_CEP_PROPAGATION_DELAY_SLO_* overrides.
   233    - Identifier: CiliumEndpointPropagationDelay
   234      Method: CiliumEndpointPropagationDelay
   235      Params:
   236        action: {{$action}}
   237        bucketSLO: {{$CEP_PROPAGATION_DELAY_SLO_BUCKET}}
   238        percentileSLO: {{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE}}
   239        enableViolations: true
   240  {{end}}
   241    - Identifier: TestMetrics
            # Always registered. String-valued overrides are passed through
            # YamlQuote (second argument 4); the boolean switches are rendered
            # unquoted.
   242      Method: TestMetrics
   243      Params:
   244        action: {{$action}}
   245        systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}}
   246        clusterOOMsIgnoredProcesses: {{YamlQuote $CLUSTER_OOMS_IGNORED_PROCESSES 4}}
   247        clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}}
   248        restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}}
   249        enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}}
   250  - module:
            # Always included; the module receives only the current action.
   251      path: modules/dns-performance-metrics.yaml
   252      params:
   253        action: {{$action}}
   254  
   255  {{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}}
        # Included only when $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST is
        # true; the module receives only the current action.
   256  - module:
   257      path: modules/network-policy/net-policy-metrics.yaml
   258      params:
   259        action: {{$action}}
   260  {{end}}