# Source: k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/testing/load/modules/measurements.yaml

## Measurement module defines test scoped measurement.

## Input params
# Valid actions: "start", "gather"
# The action is forwarded unchanged to every measurement and sub-module below.
{{$action := .action}}

## Feature-gates and configs:
# Each variable is set by calling DefaultParam with an override parameter
# followed by a fallback literal.
{{$ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_ALLOWED_SLOW_API_CALLS 0}}
{{$API_AVAILABILITY_PERCENTAGE_THRESHOLD := DefaultParam .CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD 99.5}}
{{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}}
{{$CUSTOM_API_CALL_THRESHOLDS := DefaultParam .CUSTOM_API_CALL_THRESHOLDS ""}}
{{$ENABLE_API_AVAILABILITY_MEASUREMENT := DefaultParam .CL2_ENABLE_API_AVAILABILITY_MEASUREMENT false}}
{{$API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED false}}
{{$API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS false}}
{{$API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS false}}
{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}}
{{$ENABLE_SLO_MEASUREMENT := DefaultParam .CL2_ENABLE_SLO_MEASUREMENT true}}
{{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}}
{{$ENABLE_NODE_LOCAL_DNS_LATENCY := DefaultParam .CL2_ENABLE_NODE_LOCAL_DNS_LATENCY false}}
{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}}
{{$ENABLE_SYSTEM_POD_METRICS := DefaultParam .ENABLE_SYSTEM_POD_METRICS true}}
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS false}}
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}}
{{$ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT := DefaultParam .CL2_ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT false}}
{{$CEP_PROPAGATION_DELAY_SLO_BUCKET := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_BUCKET 600}}
{{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_PERCENTILE 95.0}}
{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}}
{{$ENABLE_CONTAINER_RESTARTS_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESTARTS_MEASUREMENT false}}
{{$ENABLE_CONTAINER_RESOURCES_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESOURCES_MEASUREMENT false}}
{{$ENABLE_TERMINATED_WATCHES_MEASUREMENT := DefaultParam .CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT false}}
{{$ENABLE_QUOTAS_USAGE_MEASUREMENT := DefaultParam .CL2_ENABLE_QUOTAS_USAGE_MEASUREMENT false}}
{{$ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_ALLOWED_CONTAINER_RESTARTS 1}}
{{$CUSTOM_ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_CUSTOM_ALLOWED_CONTAINER_RESTARTS ""}}
{{$NODE_LOCAL_DNS_LATENCY_THRESHOLD := DefaultParam .CL2_NODE_LOCAL_DNS_LATENCY_THRESHOLD "5s"}}
{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}}
{{$PROMETHEUS_SCRAPE_KUBE_STATE_METRICS := DefaultParam .PROMETHEUS_SCRAPE_KUBE_STATE_METRICS false}}
{{$PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS := DefaultParam .PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS false}}
{{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION := DefaultParam .CL2_PROBE_MEASUREMENTS_PING_SLEEP_DURATION "1s"}}
{{$RESTART_COUNT_THRESHOLD_OVERRIDES := DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}}
{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}}
{{$ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES false}}
{{$NETWORK_PROGRAMMING_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD "30s"}}
{{$NETWORK_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_LATENCY_THRESHOLD "0s"}}

# Probe measurements shared parameter
{{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT := DefaultParam .CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT "15m"}}

steps:
- name: "{{$action}}ing measurements"
  measurements:
  # Two APIResponsivenessPrometheus instances are always emitted; the violation
  # settings of the first are only rendered when the simple latency query is not
  # selected.
  - Identifier: APIResponsivenessPrometheus
    Method: APIResponsivenessPrometheus
    Params:
      action: {{$action}}
      {{if not $USE_SIMPLE_LATENCY_QUERY}}
      enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS}}
      allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}}
      customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}}
      {{end}}
  - Identifier: APIResponsivenessPrometheusSimple
    Method: APIResponsivenessPrometheus
    Params:
      action: {{$action}}
      enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}}
      useSimpleLatencyQuery: true
      summaryName: APIResponsivenessPrometheus_simple
      allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}}
      customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}}
  - Identifier: CreatePhasePodStartupLatency
    Method: PodStartupLatency
    Params:
      action: {{$action}}
      labelSelector: group = load
      threshold: 1h # TODO(https://github.com/kubernetes/perf-tests/issues/1024): Ideally, this should be 5s
  {{if $ENABLE_IN_CLUSTER_NETWORK_LATENCY}}
  - Identifier: InClusterNetworkLatency
    Method: InClusterNetworkLatency
    Params:
      action: {{$action}}
      checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}}
      # 2 plus the result of DivideInt on the node count and 100.
      replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
      pingSleepDuration: {{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION}}
      threshold: {{$NETWORK_LATENCY_THRESHOLD}}
  {{end}}
  {{if $ENABLE_NODE_LOCAL_DNS_LATENCY}}
  - Identifier: NodeLocalDNSLatency
    Method: NodeLocalDNSLatencyPrometheus
    Params:
      action: {{$action}}
      enableViolations: true
      threshold: {{$NODE_LOCAL_DNS_LATENCY_THRESHOLD}}
  {{end}}
  {{if $ENABLE_SLO_MEASUREMENT}}
  - Identifier: SLOMeasurement
    Method: SLOMeasurement
    Params:
      action: {{$action}}
      checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}}
      replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
  {{end}}
  # Both kube-proxy based measurements are gated on the same scrape flag.
  {{if $PROMETHEUS_SCRAPE_KUBE_PROXY}}
  - Identifier: NetworkProgrammingLatency
    Method: NetworkProgrammingLatency
    Params:
      action: {{$action}}
      enableViolations: {{$ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES}}
      threshold: {{$NETWORK_PROGRAMMING_LATENCY_THRESHOLD}}
  - Identifier: Kube-proxy partial iptables restore failures
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: KubeProxyIptablesRestoreFailures
      metricVersion: v1alpha1
      unit: failures
      queries:
      - name: Total
        query: sum(kubeproxy_sync_proxy_rules_iptables_partial_restore_failures_total)
        requireSamples: false # It is a feature gate and may not be enabled
        threshold: 0
  {{end}}
  {{if $PROMETHEUS_SCRAPE_KUBE_STATE_METRICS}}
  - Identifier: KubeStateMetricsLatency
    Method: KubeStateMetricsLatency
    Params:
      action: {{$action}}
  {{end}}
  {{if $PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS}}
  - Identifier: MetricsServerPrometheus
    Method: MetricsServerPrometheus
    Params:
      action: {{$action}}
  {{end}}

  {{if $ENABLE_API_AVAILABILITY_MEASUREMENT}}
  - Identifier: APIAvailability
    Method: APIAvailability
    Params:
      action: {{$action}}
      pollFrequency: "5s"
      hostPollTimeoutSeconds: 5
      threshold: {{$API_AVAILABILITY_PERCENTAGE_THRESHOLD}}
      # The host IP selection keys are only rendered when explicitly configured.
      {{if $API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED}}
      useHostInternalIPs: {{$API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS}}
      useHostPublicIPs: {{$API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS}}
      {{end}}
  {{end}}
  {{if $ENABLE_CONTAINER_RESTARTS_MEASUREMENT}}
  - Identifier: ContainerRestarts
    Method: ContainerRestarts
    Params:
      action: {{$action}}
      enableViolations: true
      defaultAllowedRestarts: {{$ALLOWED_CONTAINER_RESTARTS}}
      customAllowedRestarts: {{YamlQuote $CUSTOM_ALLOWED_CONTAINER_RESTARTS 4}}
  {{end}}
  # Per-container CPU and memory percentiles (99th, 90th, 50th), grouped by the
  # "container" label.
  {{if $ENABLE_CONTAINER_RESOURCES_MEASUREMENT}}
  - Identifier: ContainerCPU
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Container CPU
      metricVersion: v1
      unit: cores
      dimensions:
      - container
      queries:
      - name: Perc99
        query: quantile_over_time(0.99, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
      - name: Perc90
        query: quantile_over_time(0.90, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
      - name: Perc50
        query: quantile_over_time(0.50, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
  - Identifier: ContainerMemory
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Container Memory
      metricVersion: v1
      unit: MiB
      dimensions:
      - container
      queries:
      - name: Perc99
        query: quantile_over_time(0.99, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
      - name: Perc90
        query: quantile_over_time(0.90, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
      - name: Perc50
        query: quantile_over_time(0.50, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
  {{end}}
  {{if $ENABLE_TERMINATED_WATCHES_MEASUREMENT}}
  - Identifier: TerminatedWatchesMetrics
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Terminated Watches
      metricVersion: v1
      dimensions:
      - resource
      queries:
      - name: Terminated watches
        query: sum(increase(apiserver_terminated_watchers_total[%v:])) by (resource)
  - Identifier: WatchCacheInitializations
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Watch Cache Initializations
      metricVersion: v1
      dimensions:
      - resource
      queries:
      - name: Watch cache reinitializations
        query: sum(increase(apiserver_watch_cache_initializations_total[%v:])) by (resource)
  {{end}}
  {{if $ENABLE_QUOTAS_USAGE_MEASUREMENT}}
  - Identifier: Quotas total usage
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Quota usage
      metricVersion: v1
      prometheusClient: managed
      unit: QPMs
      dimensions:
      - quota_metric
      queries:
      # The per-second irate is multiplied by 60 to match the per-minute unit above.
      - name: perc99
        query: quantile_over_time(0.99, sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60
      - name: max
        query: max_over_time(sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60
  {{end}}
  {{if $ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT}}
  - Identifier: CiliumEndpointPropagationDelay
    Method: CiliumEndpointPropagationDelay
    Params:
      action: {{$action}}
      bucketSLO: {{$CEP_PROPAGATION_DELAY_SLO_BUCKET}}
      percentileSLO: {{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE}}
      enableViolations: true
  {{end}}
  - Identifier: TestMetrics
    Method: TestMetrics
    Params:
      action: {{$action}}
      systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}}
      clusterOOMsIgnoredProcesses: {{YamlQuote $CLUSTER_OOMS_IGNORED_PROCESSES 4}}
      clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}}
      restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}}
      enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}}
# Sub-modules receive the same action as the measurements above.
- module:
    path: modules/dns-performance-metrics.yaml
    params:
      action: {{$action}}

{{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}}
- module:
    path: modules/network-policy/net-policy-metrics.yaml
    params:
      action: {{$action}}
{{end}}