# Origin: k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/testing/load/modules/network-policy/net-policy-metrics.yaml
#
# Declares one step with up to two GenericPrometheusQuery measurements:
#   * NetworkPolicyEnforcementLatency - latency histograms queried by metric name
#     (policy creation, pod creation reachability, pod IP assignment).
#   * NetworkPolicyMetrics            - Cilium metrics related to network policies.
#
# Module parameters:
#   .action                    forwarded unchanged to each measurement.
#                              Valid actions: "start", "gather"
#   .usePolicyCreationMetrics  render the PolicyCreation queries (default: true)
#   .usePodCreationMetrics     render the PodCreation and PodIpAssignedLatency queries (default: true)
#   .useCiliumMetrics          render the whole NetworkPolicyMetrics measurement (default: true)
{{$action := .action}}
{{$usePolicyCreationMetrics := DefaultParam .usePolicyCreationMetrics true}}
{{$usePodCreationMetrics := DefaultParam .usePodCreationMetrics true}}
{{$useCiliumMetrics := DefaultParam .useCiliumMetrics true}}

# CL2 params
# A threshold whose default is negative is switched off unless it is overridden:
# its "threshold:" key is rendered only when the value is zero or higher.
# The two Cilium thresholds with non-negative defaults (import errors, failed
# regeneration percentage) are always rendered.
# NOTE(review): the guards below compare against the integer literal 0, and Go
# templates do not compare an integer with a float. A fractional override
# (for example 0.5) would presumably fail rendering - confirm how CL2 types
# override values.
{{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS -1}}
{{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS -1}}
{{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS -1}}
{{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD 0}}
{{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD 0.01}}
{{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_REGEN_TIME_99_THRESHOLD -1}}
{{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD -1}}

steps:
  # The step name is built from the action, e.g. "start" renders as "starting ...".
  - name: "{{$action}}ing network policy metrics"
    measurements:
      - Identifier: NetworkPolicyEnforcementLatency
        Method: GenericPrometheusQuery
        Params:
          action: {{$action}}
          metricName: "Network Policy Enforcement Latency"
          metricVersion: v1
          unit: s
          queries:
            # Network policy enforcement metrics gathered from the test clients.
            # NOTE(review): the PolicyCreation quantiles read the raw cumulative
            # buckets, while the PodCreation / PodIpAssignedLatency quantiles use
            # rate(...[%v]) - confirm the difference is intentional.
            {{if $usePolicyCreationMetrics}}
            - name: PolicyCreation - TargetCount
              query: sum(policy_enforcement_latency_policy_creation_seconds_count)
            - name: PolicyCreation - Perc50
              query: histogram_quantile(0.5, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
            - name: PolicyCreation - Perc90
              query: histogram_quantile(0.9, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
            - name: PolicyCreation - Perc95
              query: histogram_quantile(0.95, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
            - name: PolicyCreation - Perc99
              query: histogram_quantile(0.99, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
              # Optional threshold, attached to the Perc99 query only.
              {{if ge $NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS 0}}
              threshold: {{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS}}
              {{end}}
            {{end}}
            {{if $usePodCreationMetrics}}
            # The "%v" inside the range selectors is a placeholder that is not
            # resolved in this file; presumably the measurement substitutes the
            # observed time window - confirm against GenericPrometheusQuery.
            - name: PodCreation - TargetCount
              query: sum(pod_creation_reachability_latency_seconds_count)
            - name: PodCreation - Perc50
              query: histogram_quantile(0.5, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
            - name: PodCreation - Perc90
              query: histogram_quantile(0.9, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
            - name: PodCreation - Perc95
              query: histogram_quantile(0.95, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
            - name: PodCreation - Perc99
              query: histogram_quantile(0.99, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
              # Optional threshold, attached to the Perc99 query only.
              {{if ge $NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS 0}}
              threshold: {{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS}}
              {{end}}
            - name: PodIpAssignedLatency - TargetCount
              query: sum(pod_ip_address_assigned_latency_seconds_count)
            - name: PodIpAssignedLatency - Perc50
              query: histogram_quantile(0.50, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
            - name: PodIpAssignedLatency - Perc90
              query: histogram_quantile(0.90, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
            - name: PodIpAssignedLatency - Perc95
              query: histogram_quantile(0.95, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
            - name: PodIpAssignedLatency - Perc99
              query: histogram_quantile(0.99, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
              # Optional threshold, attached to the Perc99 query only.
              {{if ge $NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS 0}}
              threshold: {{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS}}
              {{end}}
            {{end}}

      {{if $useCiliumMetrics}}
      - Identifier: NetworkPolicyMetrics
        Method: GenericPrometheusQuery
        Params:
          action: {{$action}}
          metricName: "Network Policy Performance"
          metricVersion: v1
          # NOTE(review): "s" is declared for the whole measurement, although
          # several queries below return counts or a percentage.
          unit: s
          queries:
            # Cilium agent metrics that are related to network policies.
            - name: Number of times a policy import has failed
              # To be replaced with the new Cilium metric that counts all policy changes, not just import errors.
              # Once that metric is available, this can be reported as a percentage of failed imports.
              # https://github.com/cilium/cilium/pull/23349
              query: sum(cilium_policy_import_errors_total)
              threshold: {{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD}}
            - name: Failed endpoint regenerations percentage
              # The ratio is multiplied by 100, so the threshold (default 0.01)
              # is expressed in percent, not as a fraction.
              query: sum(cilium_endpoint_regenerations_total{outcome="fail"}) / sum(cilium_endpoint_regenerations_total) * 100
              threshold: {{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD}}
            - name: Policy regeneration time - Perc50
              query: histogram_quantile(0.50, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
            - name: Policy regeneration time - Perc99
              query: histogram_quantile(0.99, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
              # Optional threshold, attached to the Perc99 query only.
              {{if ge $CILIUM_POLICY_REGEN_TIME_99_THRESHOLD 0}}
              threshold: {{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD}}
              {{end}}
            - name: Time between a policy change and it being fully deployed into the datapath - Perc50
              query: histogram_quantile(0.50, sum(cilium_policy_implementation_delay_bucket) by (le))
            - name: Time between a policy change and it being fully deployed into the datapath - Perc99
              query: histogram_quantile(0.99, sum(cilium_policy_implementation_delay_bucket) by (le))
            - name: Latency of policy update trigger - Perc50
              query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le))
            - name: Latency of policy update trigger - Perc99
              query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le))
            - name: Duration of policy update trigger - Perc50
              query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le))
            - name: Duration of policy update trigger - Perc99
              query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le))
            - name: Endpoint regeneration latency - Perc50
              query: histogram_quantile(0.50, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
            - name: Endpoint regeneration latency - Perc99
              query: histogram_quantile(0.99, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
              # Optional threshold, attached to the Perc99 query only.
              {{if ge $CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD 0}}
              threshold: {{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD}}
              {{end}}
            - name: Number of policies currently loaded
              query: avg(cilium_policy)
            - name: Number of endpoints labeled by policy enforcement status
              query: sum(cilium_policy_endpoint_enforcement_status)
      {{end}}