github.com/eth-easl/loader@v0.0.0-20230908084258-8a37e1d94279/pkg/metric/scrape_kn.py

github.com/eth-easl/loader@v0.0.0-20230908084258-8a37e1d94279/pkg/metric/scrape_kn.py (about)

     1  #  MIT License
     2  #
     3  #  Copyright (c) 2023 EASL and the vHive community
     4  #
     5  #  Permission is hereby granted, free of charge, to any person obtaining a copy
     6  #  of this software and associated documentation files (the "Software"), to deal
     7  #  in the Software without restriction, including without limitation the rights
     8  #  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     9  #  copies of the Software, and to permit persons to whom the Software is
    10  #  furnished to do so, subject to the following conditions:
    11  #
    12  #  The above copyright notice and this permission notice shall be included in all
    13  #  copies or substantial portions of the Software.
    14  #
    15  #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    16  #  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    17  #  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    18  #  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    19  #  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    20  #  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    21  #  SOFTWARE.
    22  
    23  import os
    24  import json
    25  
    26  prometheus_ip = os.popen("kubectl get svc -n monitoring | grep prometheus-kube-prometheus-prometheus | awk '{print $3}'").read().strip().split('\n')[0]
    27  
    28  def get_promql_query(query):
    29      def promql_query():
    30          return "tools/bin/promql --no-headers --host 'http://" + prometheus_ip + ":9090' '" + query + "' | awk '{print $1}'"
    31      return promql_query
    32  
    33  if __name__ == "__main__":
    34      kn_status = {
    35          # Desired counts set by autoscalers.
    36          "desired_pods": get_promql_query('sum(autoscaler_desired_pods)'), 
    37          # Creating containers.
    38          "unready_pods": get_promql_query('sum(autoscaler_not_ready_pods)'),
    39          # Scheduling + image pulling.
    40          "pending_pods": get_promql_query('sum(autoscaler_pending_pods)'),
    41          # Number of pods autoscalers requested from Kubernetes.
    42          "requested_pods": get_promql_query('sum(autoscaler_requested_pods)'),
    43          "running_pods": get_promql_query('sum(autoscaler_actual_pods)'),
    44          "activator_request_count": get_promql_query('sum(activator_request_count)'),
    45  
    46          "autoscaler_stable_queue": get_promql_query('avg(autoscaler_stable_request_concurrency)'),
    47          "autoscaler_panic_queue": get_promql_query('avg(autoscaler_panic_request_concurrency)'),
    48          "activator_queue": get_promql_query('avg(activator_request_concurrency)'),
    49          
    50          # The p95 latency of single scheduling round (algorithm+binding) over a time window of 30s.
    51          "scheduling_p95": get_promql_query(
    52              'histogram_quantile(0.95, sum by (le) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[30s])))'
    53          ), 
    54          "scheduling_p50": get_promql_query(
    55              'histogram_quantile(0.50, sum by (le) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[30s])))'
    56          ),  
    57  
    58          # The p95 latency of E2E pod placement (potentially multiple scheduling rounds) over a time window of 30s.
    59          "e2e_placement_p95": get_promql_query(
    60              'histogram_quantile(0.95, sum by (le) (rate(scheduler_pod_scheduling_duration_seconds_bucket{job="kube-scheduler"}[30s])))'
    61          ),
    62          "e2e_placement_p50": get_promql_query(
    63              'histogram_quantile(0.50, sum by (le) (rate(scheduler_pod_scheduling_duration_seconds_bucket{job="kube-scheduler"}[30s])))'
    64          ), 
    65      }
    66  
    67      for label, query in kn_status.items():
    68  
    69          while True:
    70              try:
    71                  measure = os.popen(query()).read().strip()
    72                  if 'error' not in measure:
    73                      break
    74              except:
    75                  pass
    76  
    77          if label.endswith('queue'):
    78              measure = float(measure) if measure else -99
    79          elif 'p50' in label or 'p95' in label:
    80              if measure == 'NaN': 
    81                  # Not available.
    82                  measure = -99
    83              else: 
    84                  measure = float(measure) if measure else -99
    85          else:
    86              measure = int(measure) if measure else -99
    87              
    88          kn_status[label] = measure
    89      
    90      print(json.dumps(kn_status))
    91  
    92