github.com/yrj2011/jx-test-infra@v0.0.0-20190529031832-7a2065ee98eb/metrics/bigquery.py

#!/usr/bin/env python

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Runs bigquery metrics and uploads the result to GCS."""

import argparse
import calendar
import glob
import json
import os
import pipes
import re
import subprocess
import sys
import time
import traceback

import influxdb
import requests
import yaml


def check(cmd, **kwargs):
    """Logs and runs the command, raising on errors."""
    print >>sys.stderr, 'Run:', ' '.join(pipes.quote(c) for c in cmd),
    if hasattr(kwargs.get('stdout'), 'name'):
        print >>sys.stderr, ' > %s' % kwargs['stdout'].name
    else:
        print
    # If the 'stdin' keyword arg is a string, run the command and feed that
    # string to its stdin.
    if 'stdin' in kwargs and isinstance(kwargs['stdin'], str):
        in_string = kwargs['stdin']
        kwargs['stdin'] = subprocess.PIPE
        proc = subprocess.Popen(cmd, **kwargs)
        proc.communicate(input=in_string)
        return
    subprocess.check_call(cmd, **kwargs)


def validate_metric_name(name):
    """Raise ValueError if name contains anything other than word characters and dashes."""
    # The regex '$' symbol also matches an optional terminating newline,
    # so we still have to check that the name doesn't end with one
    # even when the regex matches.
    if not re.match(r'^[\w-]+$', name) or name[-1] == '\n':
        raise ValueError(name)


def do_jq(jq_filter, data_filename, out_filename, jq_bin='jq'):
    """Executes jq on a file and outputs the results to a file."""
    with open(out_filename, 'w') as out_file:
        check([jq_bin, jq_filter, data_filename], stdout=out_file)
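
# Illustrative example: a metric config's 'jqfilter' (read further below) is
# handed to do_jq() verbatim. With a made-up filter and the date-stamped file
# names used by run_metric()/jq_upload(), the call
#     do_jq('[.[] | {job: .job, failures: (.failures | tonumber)}]',
#           'raw-2017-01-01.json', 'daily-2017-01-01.json')
# is equivalent to the shell command
#     jq '[.[] | {job: .job, failures: (.failures | tonumber)}]' \
#         raw-2017-01-01.json > daily-2017-01-01.json
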
class BigQuerier(object):
    def __init__(self, project, bucket_path, backfill_days, influx_client):
        if not project:
            raise ValueError('project', project)
        self.project = project
        if not bucket_path:
            print >>sys.stderr, 'Not uploading results, no bucket specified.'
        self.prefix = bucket_path

        self.influx = influx_client
        self.backfill_days = backfill_days

    def do_query(self, query, out_filename):
        """Executes a bigquery query, outputting the results to a file."""
        cmd = [
            'bq', 'query', '--format=prettyjson',
            '--project_id=%s' % self.project,
            '-n100000',  # Results may have more than 100 rows
            query,
        ]
        with open(out_filename, 'w') as out_file:
            check(cmd, stdout=out_file)
        print  # bq doesn't output a trailing newline

    def jq_upload(self, config, data_filename):
        """Filters a data file with jq and uploads the results to GCS."""
        filtered = 'daily-%s.json' % time.strftime('%Y-%m-%d')
        latest = '%s-latest.json' % config['metric']
        do_jq(config['jqfilter'], data_filename, filtered)

        self.copy(filtered, os.path.join(config['metric'], filtered))
        self.copy(filtered, latest)

    def influx_upload(self, config, data_filename):
        """Uses jq to extract InfluxDB time series points then uploads to DB."""
        points = '%s-data-points.json' % config['metric']
        jq_point = config.get('measurements', {}).get('jq', None)
        if not jq_point:
            return
        do_jq(jq_point, data_filename, points)
        with open(points) as points_file:
            try:
                points = json.load(points_file)
            except ValueError:
                print >>sys.stderr, "No influxdb points to upload.\n"
                return
        if not self.influx:
            print >>sys.stderr, (
                'Skipping influxdb upload of metric %s, no db configured.\n'
                % config['metric']
            )
            return
        points = [ints_to_floats(point) for point in points]
        self.influx.write_points(points, time_precision='s', batch_size=100)

    def run_metric(self, config):
        """Runs query and filters results, uploading data to GCS."""
        raw = 'raw-%s.json' % time.strftime('%Y-%m-%d')

        self.update_query(config)
        self.do_query(config['query'], raw)
        self.copy(raw, os.path.join(config['metric'], raw))

        consumer_error = False
        for consumer in [self.jq_upload, self.influx_upload]:
            try:
                consumer(config, raw)
            except (
                    ValueError,
                    KeyError,
                    IOError,
                    requests.exceptions.ConnectionError,
                    influxdb.client.InfluxDBClientError,
                    influxdb.client.InfluxDBServerError,
            ):
                print >>sys.stderr, traceback.format_exc()
                consumer_error = True
        if consumer_error:
            raise ValueError('Error(s) were thrown by query result consumers.')

    def copy(self, src, dest):
        """Use gsutil to copy src to <bucket_path>/dest with minimal caching."""
        if not self.prefix:
            return  # no destination
        dest = os.path.join(self.prefix, dest)
        check(['gsutil', '-h', 'Cache-Control:max-age=60', 'cp', src, dest])
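
    # Taken together, the methods above leave the following objects under the
    # configured bucket prefix for a metric named, say, 'failures' (the name is
    # made up for illustration):
    #     <prefix>/failures/raw-YYYY-MM-DD.json     full query results
    #     <prefix>/failures/daily-YYYY-MM-DD.json   jq-filtered results
    #     <prefix>/failures-latest.json             copy of the newest filtered file
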
    def update_query(self, config):
        """Modifies config['query'] based on the metric configuration."""

        # Currently the only modification that is supported is injecting the
        # timestamp of the most recent influxdb data for a given metric.
        # (For backfilling)
        measure = config.get('measurements', {}).get('backfill')
        if not measure:
            return
        if self.influx:
            # To get the last data point timestamp we must also fetch a field.
            # So first find a field that we can query if the metric exists.
            points = self.influx.query('show field keys from %s limit 1' % measure)
            points = list(points.get_points())

            field = points and points[0].get('fieldKey')
            last_time = None
            if field:
                results = self.influx.query(
                    'select last(%s), time from %s limit 1' % (field, measure)
                )
                last_time = next(results.get_points(), {}).get('time')
            if last_time:
                # Format the time properly.
                last_time = time.strptime(last_time, '%Y-%m-%dT%H:%M:%SZ')
                last_time = calendar.timegm(last_time)
            if not last_time:
                last_time = int(time.time() - (60*60*24*self.backfill_days))
        else:
            # InfluxDB is not enabled, so skip the lookup and use the default
            # backfill window instead.
            last_time = int(time.time() - (60*60*24)*self.backfill_days)

        # Replace the <LAST_DATA_TIME> tag with the formatted time.
        config['query'] = config['query'].replace('<LAST_DATA_TIME>', str(last_time))


def all_configs(search='**.yaml'):
    """Returns config files in the metrics dir."""
    return glob.glob(os.path.join(
        os.path.dirname(__file__), 'configs', search))


def make_influx_client():
    """Make an InfluxDB client from config at path $VELODROME_INFLUXDB_CONFIG."""
    if 'VELODROME_INFLUXDB_CONFIG' not in os.environ:
        return None

    with open(os.environ['VELODROME_INFLUXDB_CONFIG']) as config_file:
        config = json.load(config_file)

    def check_config(field):
        if field not in config:
            raise ValueError('DB client config needs field \'%s\'' % field)
    check_config('host')
    check_config('port')
    check_config('user')
    check_config('password')
    return influxdb.InfluxDBClient(
        host=config['host'],
        port=config['port'],
        username=config['user'],
        password=config['password'],
        database='metrics',
    )


def ints_to_floats(point):
    """Convert all integer values in a point to floats, leaving 'time' untouched."""
    for key, val in point.iteritems():
        if key == 'time':
            continue
        if isinstance(val, int):
            point[key] = float(val)
        elif isinstance(val, dict):
            point[key] = ints_to_floats(val)
    return point
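
# For reference, each metric config loaded by main() below is a YAML file whose
# fields are the ones read above: 'metric', 'query', 'jqfilter', and optionally
# 'measurements.jq' / 'measurements.backfill'. The concrete values in this
# sketch are made up for illustration only:
#
#   metric: failures
#   query: |
#     SELECT job, COUNT(*) AS failures
#     FROM [my-project:build.all]
#     WHERE timestamp > <LAST_DATA_TIME>
#     GROUP BY job
#   jqfilter: |
#     [.[] | {job: .job, failures: (.failures | tonumber)}]
#   measurements:
#     backfill: failures
#     jq: |
#       [.[] | {measurement: "failures", tags: {job: .job},
#               fields: {count: (.failures | tonumber)}}]
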
def main(configs, project, bucket_path, backfill_days):
    """Loads metric config files and runs each metric."""
    queryer = BigQuerier(project, bucket_path, backfill_days, make_influx_client())

    # The 'bq show' command is called as a hack to dodge the config prompts that
    # bq presents the first time it is run. A newline is passed to stdin to skip
    # the prompt for a default project when the service account in use has
    # access to multiple projects.
    check(['bq', 'show'], stdin='\n')

    errs = []
    for path in configs or all_configs():
        try:
            with open(path) as config_raw:
                config = yaml.safe_load(config_raw)
            if not config:
                raise ValueError('invalid yaml: %s.' % path)
            config['metric'] = config['metric'].strip()
            validate_metric_name(config['metric'])
            queryer.run_metric(config)
        except (
                ValueError,
                KeyError,
                IOError,
                subprocess.CalledProcessError,
        ):
            print >>sys.stderr, traceback.format_exc()
            errs.append(path)

    if errs:
        print 'Failed %d configs: %s' % (len(errs), ', '.join(errs))
        sys.exit(1)


if __name__ == '__main__':
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument(
        '--config', action='append', help='YAML file describing a metric.')
    PARSER.add_argument(
        '--project',
        default='k8s-gubernator',
        help='Charge the specified account for bigquery usage.')
    PARSER.add_argument(
        '--bucket',
        help='Upload results to the specified gcs bucket.')
    PARSER.add_argument(
        '--backfill-days',
        default=30,
        type=int,
        help='Number of days to backfill influxdb data.')

    ARGS = PARSER.parse_args()
    main(ARGS.config, ARGS.project, ARGS.bucket, ARGS.backfill_days)
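
# Illustrative invocation (config path, project, and bucket names are made up):
#
#   VELODROME_INFLUXDB_CONFIG=influx-creds.json \
#   ./bigquery.py --config configs/failures-config.yaml \
#       --project my-gcp-project --bucket gs://my-metrics-bucket --backfill-days 7
#
# The optional $VELODROME_INFLUXDB_CONFIG file is JSON providing the 'host',
# 'port', 'user', and 'password' fields checked in make_influx_client() above.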