k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/metrics/bigquery.py (about)

     1  #!/usr/bin/env python3
     2  
     3  # Copyright 2017 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  """Runs bigquery metrics and uploads the result to GCS."""
    18  
import argparse
import glob
import os
import pipes
import re
import shlex
import subprocess
import sys
import time
import traceback

import requests
import ruamel.yaml as yaml
    31  
    32  BACKFILL_DAYS = 30
    33  DEFAULT_JQ_BIN = '/usr/bin/jq'
    34  
    35  def check(cmd, **kwargs):
    36      """Logs and runs the command, raising on errors."""
    37      print('Run:', ' '.join(pipes.quote(c) for c in cmd), end=' ', file=sys.stderr)
    38      if hasattr(kwargs.get('stdout'), 'name'):
    39          print(' > %s' % kwargs['stdout'].name, file=sys.stderr)
    40      else:
    41          print()
    42      # If 'stdin' keyword arg is a string run command and communicate string to stdin
    43      if 'stdin' in kwargs and isinstance(kwargs['stdin'], str):
    44          in_string = kwargs['stdin']
    45          kwargs['stdin'] = subprocess.PIPE
    46          proc = subprocess.Popen(cmd, **kwargs)
    47          proc.communicate(input=in_string.encode('utf-8'))
    48          return
    49      subprocess.check_call(cmd, **kwargs)
    50  
    51  
    52  def validate_metric_name(name):
    53      """Raise ValueError if name is non-trivial."""
    54      # Regex '$' symbol matches an optional terminating new line
    55      # so we have to check that the name
    56      # doesn't have one if the regex matches.
    57      if not re.match(r'^[\w-]+$', name) or name[-1] == '\n':
    58          raise ValueError(name)
    59  
    60  
    61  def do_jq(jq_filter, data_filename, out_filename, jq_bin=DEFAULT_JQ_BIN):
    62      """Executes jq on a file and outputs the results to a file."""
    63      with open(out_filename, 'w') as out_file:
    64          check([jq_bin, jq_filter, data_filename], stdout=out_file)
    65  
    66  
    67  class BigQuerier:
    68      def __init__(self, project, bucket_path):
    69          if not project:
    70              raise ValueError('project', project)
    71          self.project = project
    72          if not bucket_path:
    73              print('Not uploading results, no bucket specified.', file=sys.stderr)
    74          self.prefix = bucket_path
    75  
    76      def do_query(self, query, out_filename):
    77          """Executes a bigquery query, outputting the results to a file."""
    78          cmd = [
    79              'bq', 'query', '--format=prettyjson',
    80              '--project_id=%s' % self.project,
    81              '--max_rows=1000000',  # Results may have more than 100 rows
    82              query,
    83          ]
    84          with open(out_filename, 'w') as out_file:
    85              check(cmd, stdout=out_file)
    86              out_file.write('\n')
    87  
    88      def jq_upload(self, config, data_filename):
    89          """Filters a data file with jq and uploads the results to GCS."""
    90          filtered = 'daily-%s.json' % time.strftime('%Y-%m-%d')
    91          latest = '%s-latest.json' % config['metric']
    92          do_jq(config['jqfilter'], data_filename, filtered)
    93  
    94          self.copy(filtered, os.path.join(config['metric'], filtered))
    95          self.copy(filtered, latest)
    96  
    97      def run_metric(self, config):
    98          """Runs query and filters results, uploading data to GCS."""
    99          raw = 'raw-%s.json' % time.strftime('%Y-%m-%d')
   100  
   101          self.update_query(config)
   102          self.do_query(config['query'], raw)
   103          self.copy(raw, os.path.join(config['metric'], raw))
   104  
   105          consumer_error = False
   106          for consumer in [self.jq_upload]:
   107              try:
   108                  consumer(config, raw)
   109              except (
   110                      ValueError,
   111                      KeyError,
   112                      IOError,
   113                      requests.exceptions.ConnectionError,
   114                  ):
   115                  print(traceback.format_exc(), file=sys.stderr)
   116                  consumer_error = True
   117          if consumer_error:
   118              raise ValueError('Error(s) were thrown by query result consumers.')
   119  
   120      def copy(self, src, dest):
   121          """Use gsutil to copy src to <bucket_path>/dest with minimal caching."""
   122          if not self.prefix:
   123              return  # no destination
   124          dest = os.path.join(self.prefix, dest)
   125          check(['gsutil', '-h', 'Cache-Control:max-age=60', 'cp', src, dest])
   126  
   127      @staticmethod
   128      def update_query(config):
   129          """Modifies config['query'] based on the metric configuration."""
   130          last_time = int(time.time() - (60*60*24)*BACKFILL_DAYS)
   131          config['query'] = config['query'].replace('<LAST_DATA_TIME>', str(last_time))
   132  
   133  
   134  def all_configs(search='**.yaml'):
   135      """Returns config files in the metrics dir."""
   136      return glob.glob(os.path.join(
   137          os.path.dirname(__file__), 'configs', search))
   138  
   139  
   140  def ints_to_floats(point):
   141      for key, val in point.items():
   142          if key == 'time':
   143              continue
   144          if isinstance(val, int):
   145              point[key] = float(val)
   146          elif isinstance(val, dict):
   147              point[key] = ints_to_floats(val)
   148      return point
   149  
   150  
   151  def main(configs, project, bucket_path):
   152      """Loads metric config files and runs each metric."""
   153      queryer = BigQuerier(project, bucket_path)
   154  
   155      # authenticate as the given service account if our environment is providing one
   156      if 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ:
   157          keyfile = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
   158          check(['gcloud', 'auth', 'activate-service-account', f'--key-file={keyfile}'])
   159  
   160      # the 'bq show' command is called as a hack to dodge the config prompts that bq presents
   161      # the first time it is run. A newline is passed to stdin to skip the prompt for default project
   162      # when the service account in use has access to multiple projects.
   163      check(['bq', 'show'], stdin='\n')
   164  
   165      errs = []
   166      for path in configs or all_configs():
   167          try:
   168              with open(path) as config_raw:
   169                  config = yaml.safe_load(config_raw)
   170              if not config:
   171                  raise ValueError('invalid yaml: %s.' % path)
   172              config['metric'] = config['metric'].strip()
   173              validate_metric_name(config['metric'])
   174              queryer.run_metric(config)
   175          except (
   176                  ValueError,
   177                  KeyError,
   178                  IOError,
   179                  subprocess.CalledProcessError,
   180              ):
   181              print(traceback.format_exc(), file=sys.stderr)
   182              errs.append(path)
   183  
   184      if errs:
   185          print('Failed %d configs: %s' % (len(errs), ', '.join(errs)))
   186          sys.exit(1)
   187  
   188  
   189  if __name__ == '__main__':
   190      PARSER = argparse.ArgumentParser()
   191      PARSER.add_argument(
   192          '--config', action='append', help='YAML file describing a metric.')
   193      PARSER.add_argument(
   194          '--project',
   195          default='k8s-gubernator',
   196          help='Charge the specified account for bigquery usage.')
   197      PARSER.add_argument(
   198          '--bucket',
   199          help='Upload results to the specified gcs bucket.')
   200      PARSER.add_argument(
   201          '--jq',
   202          help='path to jq binary')
   203  
   204      ARGS = PARSER.parse_args()
   205      if ARGS.jq:
   206          DEFAULT_JQ_BIN = ARGS.jq
   207      main(ARGS.config, ARGS.project, ARGS.bucket)