k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/testing/load/modules/network-policy/net-policy-metrics.yaml (about)

     1  # Valid actions: "start", "gather"
        # The action is substituted into the step name and into the "action" param of each measurement in this file.
     2  {{$action := .action}}
        # Module flags, each defaulting to true. In order, they gate the PolicyCreation queries,
        # the PodCreation / PodIpAssignedLatency queries, and the whole Cilium measurement.
     3  {{$usePolicyCreationMetrics := DefaultParam .usePolicyCreationMetrics true}}
     4  {{$usePodCreationMetrics := DefaultParam .usePodCreationMetrics true}}
     5  {{$useCiliumMetrics := DefaultParam .useCiliumMetrics true}}
     6  
     7  # CL2 params
     8  # Negative default values are used to turn thresholds off if not overridden. Thresholds are only enabled with values of zero or higher.
        # Exception: CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD (default 0) and CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD
        # (default 0.01) have non-negative defaults and their threshold lines are rendered unconditionally.
     9  {{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS -1}}
    10  {{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS -1}}
    11  {{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS -1}}
    12  {{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD 0}}
    13  {{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD 0.01}}
    14  {{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_REGEN_TIME_99_THRESHOLD -1}}
    15  {{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD -1}}
    16  
    17  steps:
        # Single step; with the valid actions its name renders as "starting ..." or "gathering ...".
    18  - name: "{{$action}}ing network policy metrics"
    19    measurements:
          # Measurement with the latency queries; individual query groups are toggled by the module flags.
    20    - Identifier: NetworkPolicyEnforcementLatency
    21      Method: GenericPrometheusQuery
    22      Params:
    23        action: {{$action}}
    24        metricName: "Network Policy Enforcement Latency"
    25        metricVersion: v1
    26        unit: s
    27        queries:
    28        # Network policy enforcement metrics gathered from the test clients.
              # PolicyCreation group: rendered only when usePolicyCreationMetrics is true.
    29        {{if $usePolicyCreationMetrics}}
                # NOTE(review): these quantiles aggregate the raw bucket counters, while the PodCreation and
                # PodIpAssignedLatency quantiles use rate(...[%v]) - confirm the difference is intended.
    30          - name: PolicyCreation - TargetCount
    31            query: sum(policy_enforcement_latency_policy_creation_seconds_count)
    32          - name: PolicyCreation - Perc50
    33            query: histogram_quantile(0.5, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
    34          - name: PolicyCreation - Perc90
    35            query: histogram_quantile(0.9, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
    36          - name: PolicyCreation - Perc95
    37            query: histogram_quantile(0.95, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
    38          - name: PolicyCreation - Perc99
    39            query: histogram_quantile(0.99, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le))
                # Optional threshold: becomes a key of the Perc99 entry above; rendered only when the value is >= 0.
    40          {{if ge $NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS 0}}
    41            threshold: {{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS}}
    42          {{end}}
    43        {{end}}
              # PodCreation and PodIpAssignedLatency groups: rendered only when usePodCreationMetrics is true.
    44        {{if $usePodCreationMetrics}}
                # NOTE(review): the [%v] range is presumably filled in by the measurement with the
                # measured time window - confirm against the GenericPrometheusQuery implementation.
    45          - name: PodCreation - TargetCount
    46            query: sum(pod_creation_reachability_latency_seconds_count)
    47          - name: PodCreation - Perc50
    48            query: histogram_quantile(0.5, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
    49          - name: PodCreation - Perc90
    50            query: histogram_quantile(0.9, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
    51          - name: PodCreation - Perc95
    52            query: histogram_quantile(0.95, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
    53          - name: PodCreation - Perc99
    54            query: histogram_quantile(0.99, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le))
                # Optional threshold on the PodCreation Perc99 entry above; rendered only when the value is >= 0.
    55          {{if ge $NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS 0}}
    56            threshold: {{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS}}
    57          {{end}}
    58          - name: PodIpAssignedLatency - TargetCount
    59            query: sum(pod_ip_address_assigned_latency_seconds_count)
    60          - name: PodIpAssignedLatency - Perc50
    61            query: histogram_quantile(0.50, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
    62          - name: PodIpAssignedLatency - Perc90
    63            query: histogram_quantile(0.90, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
    64          - name: PodIpAssignedLatency - Perc95
    65            query: histogram_quantile(0.95, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
    66          - name: PodIpAssignedLatency - Perc99
    67            query: histogram_quantile(0.99, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le))
                # Optional threshold on the PodIpAssignedLatency Perc99 entry above; rendered only when the value is >= 0.
    68          {{if ge $NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS 0}}
    69            threshold: {{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS}}
    70          {{end}}
    71        {{end}}
    72  
    73    {{if $useCiliumMetrics}}
          # Second measurement of the step; rendered only when useCiliumMetrics is true (the default).
    74    - Identifier: NetworkPolicyMetrics
    75      Method: GenericPrometheusQuery
    76      Params:
    77        action: {{$action}}
    78        metricName: "Network Policy Performance"
    79        metricVersion: v1
              # NOTE(review): the unit is "s" for the whole measurement, but several queries below
              # return counts or a percentage - confirm how the unit is used when results are reported.
    80        unit: s
    81        queries:
    82          # Cilium agent metrics that are related to network policies.
    83          - name: Number of times a policy import has failed
    84            # To be replaced with the new Cilium metric that counts all policy changes, not just import errors.
    85            # With that, this can be a percentage of failed imports.
    86            # https://github.com/cilium/cilium/pull/23349
    87            query: sum(cilium_policy_import_errors_total)
                  # This threshold line is always rendered (the param defaults to 0).
    88            threshold: {{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD}}
    89          - name: Failed endpoint regenerations percentage
                  # The ratio is multiplied by 100, so the threshold below is expressed in percent (default 0.01).
                  # NOTE(review): confirm how the result is treated when no regenerations have been recorded.
    90            query: sum(cilium_endpoint_regenerations_total{outcome="fail"}) / sum(cilium_endpoint_regenerations_total) * 100
    91            threshold: {{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD}}
    92          - name: Policy regeneration time - Perc50
    93            query: histogram_quantile(0.50, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
    94          - name: Policy regeneration time - Perc99
    95            query: histogram_quantile(0.99, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
                # Optional threshold on the policy regeneration Perc99 entry above; rendered only when the value is >= 0.
    96          {{if ge $CILIUM_POLICY_REGEN_TIME_99_THRESHOLD 0}}
    97            threshold: {{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD}}
    98          {{end}}
    99          - name: Time between a policy change and it being fully deployed into the datapath - Perc50
   100            query: histogram_quantile(0.50, sum(cilium_policy_implementation_delay_bucket) by (le))
   101          - name: Time between a policy change and it being fully deployed into the datapath - Perc99
   102            query: histogram_quantile(0.99, sum(cilium_policy_implementation_delay_bucket) by (le))
                # The next four queries read the same histogram, split by its "type" label (latency vs. duration).
   103          - name: Latency of policy update trigger - Perc50
   104            query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le))
   105          - name: Latency of policy update trigger - Perc99
   106            query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le))
   107          - name: Duration of policy update trigger - Perc50
   108            query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le))
   109          - name: Duration of policy update trigger - Perc99
   110            query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le))
   111          - name: Endpoint regeneration latency - Perc50
   112            query: histogram_quantile(0.50, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
   113          - name: Endpoint regeneration latency - Perc99
   114            query: histogram_quantile(0.99, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le))
                # Optional threshold on the endpoint regeneration Perc99 entry above; rendered only when the value is >= 0.
   115          {{if ge $CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD 0}}
   116            threshold: {{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD}}
   117          {{end}}
                # The remaining queries have no threshold attached.
   118          - name: Number of policies currently loaded
   119            query: avg(cilium_policy)
   120          - name: Number of endpoints labeled by policy enforcement status
   121            query: sum(cilium_policy_endpoint_enforcement_status)
   122    {{end}}