github.com/yrj2011/jx-test-infra@v0.0.0-20190529031832-7a2065ee98eb/metrics/bigquery.py

#!/usr/bin/env python

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Runs bigquery metrics and uploads the result to GCS."""

import argparse
import calendar
import glob
import json
import os
import pipes
import re
import subprocess
import sys
import time
import traceback

import influxdb
import requests
import yaml


def check(cmd, **kwargs):
    """Logs and runs the command, raising on errors."""
    print >>sys.stderr, 'Run:', ' '.join(pipes.quote(c) for c in cmd),
    if hasattr(kwargs.get('stdout'), 'name'):
        print >>sys.stderr, ' > %s' % kwargs['stdout'].name
    else:
        print >>sys.stderr  # terminate the log line on stderr, not stdout
    # If the 'stdin' keyword arg is a string, run the command and write
    # the string to its stdin.
    if 'stdin' in kwargs and isinstance(kwargs['stdin'], str):
        in_string = kwargs['stdin']
        kwargs['stdin'] = subprocess.PIPE
        proc = subprocess.Popen(cmd, **kwargs)
        proc.communicate(input=in_string)
        return
    subprocess.check_call(cmd, **kwargs)
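# Illustrative usage of check() (examples, not part of the original module):
#   check(['echo', 'hello'])           # logs "Run: echo hello" to stderr, then runs it
#   check(['bq', 'show'], stdin='\n')  # a string stdin is piped to the process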


def validate_metric_name(name):
    """Raise ValueError unless name consists only of word characters and dashes."""
    # The regex's '$' also matches just before a terminating newline, so we
    # must additionally check that the name doesn't end with one.
    if not re.match(r'^[\w-]+$', name) or name[-1] == '\n':
        raise ValueError(name)
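# Illustrative behavior (not part of the original module):
#   validate_metric_name('failures-daily')  # ok: word characters and dashes
#   validate_metric_name('bad name!')       # raises ValueError
#   validate_metric_name('sneaky\n')        # raises ValueError despite the regex match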


def do_jq(jq_filter, data_filename, out_filename, jq_bin='jq'):
    """Executes jq on a file and outputs the results to a file."""
    with open(out_filename, 'w') as out_file:
        check([jq_bin, jq_filter, data_filename], stdout=out_file)
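# Illustrative call (the filter and filenames are hypothetical):
#   do_jq('[.[] | {job: .job}]', 'raw-2019-05-29.json', 'filtered.json')
# runs: jq '[.[] | {job: .job}]' raw-2019-05-29.json, with stdout sent to filtered.json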


class BigQuerier(object):
    def __init__(self, project, bucket_path, backfill_days, influx_client):
        if not project:
            raise ValueError('project', project)
        self.project = project
        if not bucket_path:
            print >>sys.stderr, 'Not uploading results, no bucket specified.'
        self.prefix = bucket_path

        self.influx = influx_client
        self.backfill_days = backfill_days

    def do_query(self, query, out_filename):
        """Executes a bigquery query, outputting the results to a file."""
        cmd = [
            'bq', 'query', '--format=prettyjson',
            '--project_id=%s' % self.project,
            '-n100000',  # Results may have more than 100 rows
            query,
        ]
        with open(out_filename, 'w') as out_file:
            check(cmd, stdout=out_file)
            print  # bq doesn't output a trailing newline
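    # For example, with the default project 'k8s-gubernator' this logs and runs:
    #   bq query --format=prettyjson --project_id=k8s-gubernator -n100000 '<query>'
    # with stdout redirected to out_filename.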

    def jq_upload(self, config, data_filename):
        """Filters a data file with jq and uploads the results to GCS."""
        filtered = 'daily-%s.json' % time.strftime('%Y-%m-%d')
        latest = '%s-latest.json' % config['metric']
        do_jq(config['jqfilter'], data_filename, filtered)

        self.copy(filtered, os.path.join(config['metric'], filtered))
        self.copy(filtered, latest)
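    # For a hypothetical metric named 'failures' run on 2019-05-29, this uploads
    # the filtered results to two objects under the bucket path:
    #   <bucket_path>/failures/daily-2019-05-29.json
    #   <bucket_path>/failures-latest.json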

    def influx_upload(self, config, data_filename):
        """Uses jq to extract InfluxDB time series points then uploads to DB."""
        points = '%s-data-points.json' % config['metric']
        jq_point = config.get('measurements', {}).get('jq', None)
        if not jq_point:
            return
        do_jq(jq_point, data_filename, points)
        with open(points) as points_file:
            try:
                points = json.load(points_file)
            except ValueError:
                print >>sys.stderr, "No influxdb points to upload.\n"
                return
        if not self.influx:
            print >>sys.stderr, (
                'Skipping influxdb upload of metric %s, no db configured.\n'
                % config['metric']
            )
            return
        points = [ints_to_floats(point) for point in points]
        self.influx.write_points(points, time_precision='s', batch_size=100)
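    # write_points() takes the standard influxdb-python point format, so the
    # configured jq filter is expected to emit a list shaped like (illustrative):
    #   [{"measurement": "failures", "time": 1559088000,
    #     "fields": {"count": 3}, "tags": {"job": "ci-foo"}}]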

    def run_metric(self, config):
        """Runs query and filters results, uploading data to GCS."""
        raw = 'raw-%s.json' % time.strftime('%Y-%m-%d')

        self.update_query(config)
        self.do_query(config['query'], raw)
        self.copy(raw, os.path.join(config['metric'], raw))

        consumer_error = False
        for consumer in [self.jq_upload, self.influx_upload]:
            try:
                consumer(config, raw)
            except (
                    ValueError,
                    KeyError,
                    IOError,
                    requests.exceptions.ConnectionError,
                    influxdb.client.InfluxDBClientError,
                    influxdb.client.InfluxDBServerError,
            ):
                print >>sys.stderr, traceback.format_exc()
                consumer_error = True
        if consumer_error:
            raise ValueError('Error(s) were thrown by query result consumers.')

    def copy(self, src, dest):
        """Use gsutil to copy src to <bucket_path>/dest with minimal caching."""
        if not self.prefix:
            return  # no destination
        dest = os.path.join(self.prefix, dest)
        check(['gsutil', '-h', 'Cache-Control:max-age=60', 'cp', src, dest])
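    # For example, with a hypothetical prefix 'gs://my-bucket',
    # copy('daily.json', 'failures/daily.json') runs:
    #   gsutil -h Cache-Control:max-age=60 cp daily.json gs://my-bucket/failures/daily.json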

    def update_query(self, config):
        """Modifies config['query'] based on the metric configuration."""

        # Currently the only supported modification is injecting the timestamp
        # of the most recent influxdb data for a given metric (for backfilling).
        measure = config.get('measurements', {}).get('backfill')
        if not measure:
            return
        if self.influx:
            # To get the last data point timestamp we must also fetch a field.
            # So first find a field that we can query if the metric exists.
            points = self.influx.query('show field keys from %s limit 1' % measure)
            points = list(points.get_points())

            field = points and points[0].get('fieldKey')
            last_time = None
            if field:
                results = self.influx.query(
                    'select last(%s), time from %s limit 1' % (field, measure)
                )
                last_time = next(results.get_points(), {}).get('time')
                if last_time:
                    # Convert the ISO 8601 timestamp to unix time.
                    last_time = time.strptime(last_time, '%Y-%m-%dT%H:%M:%SZ')
                    last_time = calendar.timegm(last_time)
            if not last_time:
                last_time = int(time.time() - (60*60*24*self.backfill_days))
        else:
            # InfluxDB is not enabled, so fall back to the default backfill window.
            last_time = int(time.time() - (60*60*24*self.backfill_days))

        # Replace the placeholder tag with the timestamp.
        config['query'] = config['query'].replace('<LAST_DATA_TIME>', str(last_time))
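    # Illustrative config fragment and result (names are hypothetical):
    #   measurements:
    #     backfill: failures
    #   query: SELECT ... WHERE timestamp > <LAST_DATA_TIME>
    # After this call the query reads e.g.:
    #   SELECT ... WHERE timestamp > 1556501832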


def all_configs(search='**.yaml'):
    """Returns config files in the metrics dir."""
    return glob.glob(os.path.join(
        os.path.dirname(__file__), 'configs', search))
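# Note that non-recursive glob treats '**' like '*', so the default pattern
# matches the .yaml files directly under metrics/configs (illustrative result):
#   all_configs()  # -> ['.../metrics/configs/failures.yaml', ...]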


def make_influx_client():
    """Make an InfluxDB client from config at path $VELODROME_INFLUXDB_CONFIG."""
    if 'VELODROME_INFLUXDB_CONFIG' not in os.environ:
        return None

    with open(os.environ['VELODROME_INFLUXDB_CONFIG']) as config_file:
        config = json.load(config_file)

    def check_config(field):
        if field not in config:
            raise ValueError('DB client config needs field \'%s\'' % field)
    check_config('host')
    check_config('port')
    check_config('user')
    check_config('password')
    return influxdb.InfluxDBClient(
        host=config['host'],
        port=config['port'],
        username=config['user'],
        password=config['password'],
        database='metrics',
    )
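# The config file is JSON with the four required fields, e.g. (placeholder values):
#   {"host": "influxdb.example.com", "port": 8086, "user": "metrics", "password": "..."}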


def ints_to_floats(point):
    """Recursively converts int values (other than 'time') to floats.

    InfluxDB fixes a field's type on first write, so mixing int and float
    uploads for the same field would conflict; normalize to floats.
    """
    for key, val in point.iteritems():
        if key == 'time':
            continue
        if isinstance(val, int):
            point[key] = float(val)
        elif isinstance(val, dict):
            point[key] = ints_to_floats(val)
    return point
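# Illustrative transformation (the point itself is hypothetical):
#   ints_to_floats({'measurement': 'failures', 'time': 1559088000,
#                   'fields': {'count': 3}})
#   # -> {'measurement': 'failures', 'time': 1559088000, 'fields': {'count': 3.0}}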


def main(configs, project, bucket_path, backfill_days):
    """Loads metric config files and runs each metric."""
    queryer = BigQuerier(project, bucket_path, backfill_days, make_influx_client())

    # The 'bq show' command is called as a hack to dodge the config prompts that
    # bq presents the first time it is run. A newline is passed to stdin to skip
    # the prompt for a default project when the service account in use has
    # access to multiple projects.
    check(['bq', 'show'], stdin='\n')

    errs = []
    for path in configs or all_configs():
        try:
            with open(path) as config_raw:
                config = yaml.safe_load(config_raw)
            if not config:
                raise ValueError('invalid yaml: %s.' % path)
            config['metric'] = config['metric'].strip()
            validate_metric_name(config['metric'])
            queryer.run_metric(config)
        except (
                ValueError,
                KeyError,
                IOError,
                subprocess.CalledProcessError,
            ):
            print >>sys.stderr, traceback.format_exc()
            errs.append(path)

    if errs:
        print 'Failed %d configs: %s' % (len(errs), ', '.join(errs))
        sys.exit(1)


if __name__ == '__main__':
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument(
        '--config', action='append', help='YAML file describing a metric.')
    PARSER.add_argument(
        '--project',
        default='k8s-gubernator',
        help='Charge the specified account for bigquery usage.')
    PARSER.add_argument(
        '--bucket',
        help='Upload results to the specified gcs bucket.')
    PARSER.add_argument(
        '--backfill-days',
        default=30,
        type=int,
        help='Number of days to backfill influxdb data.')

    ARGS = PARSER.parse_args()
    main(ARGS.config, ARGS.project, ARGS.bucket, ARGS.backfill_days)