#!/usr/bin/env python3

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Runs bigquery metrics and uploads the result to GCS."""

import argparse
import glob
import os
import pipes
import re
import subprocess
import sys
import time
import traceback

import requests
import ruamel.yaml as yaml

BACKFILL_DAYS = 30
DEFAULT_JQ_BIN = '/usr/bin/jq'

def check(cmd, **kwargs):
    """Logs and runs the command, raising on errors."""
    print('Run:', ' '.join(pipes.quote(c) for c in cmd), end=' ', file=sys.stderr)
    if hasattr(kwargs.get('stdout'), 'name'):
        print(' > %s' % kwargs['stdout'].name, file=sys.stderr)
    else:
        print(file=sys.stderr)  # terminate the log line started above
    # If the 'stdin' keyword arg is a string, run the command and write the
    # string to its stdin.
    if 'stdin' in kwargs and isinstance(kwargs['stdin'], str):
        in_string = kwargs['stdin']
        kwargs['stdin'] = subprocess.PIPE
        proc = subprocess.Popen(cmd, **kwargs)
        proc.communicate(input=in_string.encode('utf-8'))
        return
    subprocess.check_call(cmd, **kwargs)

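# Illustrative usage of check() (hypothetical file name and query, not executed
# here); the same shapes appear in do_query() and main() below:
#
#   with open('out.json', 'w') as out_file:
#       check(['bq', 'query', '--format=prettyjson', 'SELECT 1'], stdout=out_file)
#   check(['bq', 'show'], stdin='\n')
#
# Note: when 'stdin' is a plain string, check() goes through Popen/communicate
# and does not verify the exit status, unlike the check_call path.
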
def validate_metric_name(name):
    """Raise ValueError if name is not a simple word-character/hyphen token."""
    # The regex '$' anchor also matches just before a trailing newline, so
    # explicitly reject names that end with one even when the regex matches.
    if not re.match(r'^[\w-]+$', name) or name[-1] == '\n':
        raise ValueError(name)


def do_jq(jq_filter, data_filename, out_filename, jq_bin=None):
    """Executes jq on a file and outputs the results to a file."""
    # Resolve the binary at call time so a --jq override, which rebinds
    # DEFAULT_JQ_BIN in the __main__ block, actually takes effect.
    with open(out_filename, 'w') as out_file:
        check([jq_bin or DEFAULT_JQ_BIN, jq_filter, data_filename], stdout=out_file)


class BigQuerier:
    """Runs BigQuery queries and copies the (filtered) results to GCS."""

    def __init__(self, project, bucket_path):
        if not project:
            raise ValueError('project', project)
        self.project = project
        if not bucket_path:
            print('Not uploading results, no bucket specified.', file=sys.stderr)
        self.prefix = bucket_path

    def do_query(self, query, out_filename):
        """Executes a bigquery query, outputting the results to a file."""
        cmd = [
            'bq', 'query', '--format=prettyjson',
            '--project_id=%s' % self.project,
            '--max_rows=1000000',  # Results may have more than the default 100 rows.
            query,
        ]
        with open(out_filename, 'w') as out_file:
            check(cmd, stdout=out_file)
            out_file.write('\n')

    def jq_upload(self, config, data_filename):
        """Filters a data file with jq and uploads the results to GCS."""
        filtered = 'daily-%s.json' % time.strftime('%Y-%m-%d')
        latest = '%s-latest.json' % config['metric']
        do_jq(config['jqfilter'], data_filename, filtered)

        self.copy(filtered, os.path.join(config['metric'], filtered))
        self.copy(filtered, latest)

    def run_metric(self, config):
        """Runs query and filters results, uploading data to GCS."""
        raw = 'raw-%s.json' % time.strftime('%Y-%m-%d')

        self.update_query(config)
        self.do_query(config['query'], raw)
        self.copy(raw, os.path.join(config['metric'], raw))

        consumer_error = False
        for consumer in [self.jq_upload]:
            try:
                consumer(config, raw)
            except (
                    ValueError,
                    KeyError,
                    IOError,
                    requests.exceptions.ConnectionError,
            ):
                print(traceback.format_exc(), file=sys.stderr)
                consumer_error = True
        if consumer_error:
            raise ValueError('Error(s) were thrown by query result consumers.')

    def copy(self, src, dest):
        """Use gsutil to copy src to <bucket_path>/dest with minimal caching."""
        if not self.prefix:
            return  # no destination
        dest = os.path.join(self.prefix, dest)
        check(['gsutil', '-h', 'Cache-Control:max-age=60', 'cp', src, dest])

    @staticmethod
    def update_query(config):
        """Replaces <LAST_DATA_TIME> in config['query'] with a timestamp BACKFILL_DAYS ago."""
        last_time = int(time.time() - (60*60*24)*BACKFILL_DAYS)
        config['query'] = config['query'].replace('<LAST_DATA_TIME>', str(last_time))


def all_configs(search='**.yaml'):
    """Returns config files in the metrics dir."""
    return glob.glob(os.path.join(
        os.path.dirname(__file__), 'configs', search))


def ints_to_floats(point):
    """Recursively converts int values in a data point to floats, except 'time'."""
    for key, val in point.items():
        if key == 'time':
            continue
        if isinstance(val, int):
            point[key] = float(val)
        elif isinstance(val, dict):
            point[key] = ints_to_floats(val)
    return point

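# Sketch of a metric config as consumed by main() / run_metric() below. The
# field names ('metric', 'query', 'jqfilter') come from the code; the values
# are made up for illustration -- real configs live under metrics/configs/:
#
#   metric: my-metric                    # becomes the GCS folder and *-latest.json prefix
#   query: |
#     SELECT ... FROM <some.table>
#     WHERE started > <LAST_DATA_TIME>   # placeholder filled in by update_query()
#   jqfilter: |
#     [.[] | {field: .field}]
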
def main(configs, project, bucket_path):
    """Loads metric config files and runs each metric."""
    queryer = BigQuerier(project, bucket_path)

    # Authenticate as the given service account if the environment provides one.
    if 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ:
        keyfile = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
        check(['gcloud', 'auth', 'activate-service-account', f'--key-file={keyfile}'])

    # The 'bq show' command is called as a hack to dodge the config prompts that
    # bq presents the first time it is run. A newline is passed to stdin to skip
    # the prompt for a default project when the service account in use has
    # access to multiple projects.
    check(['bq', 'show'], stdin='\n')

    errs = []
    for path in configs or all_configs():
        try:
            with open(path) as config_raw:
                config = yaml.safe_load(config_raw)
            if not config:
                raise ValueError('invalid yaml: %s.' % path)
            config['metric'] = config['metric'].strip()
            validate_metric_name(config['metric'])
            queryer.run_metric(config)
        except (
                ValueError,
                KeyError,
                IOError,
                subprocess.CalledProcessError,
        ):
            print(traceback.format_exc(), file=sys.stderr)
            errs.append(path)

    if errs:
        print('Failed %d configs: %s' % (len(errs), ', '.join(errs)))
        sys.exit(1)


if __name__ == '__main__':
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument(
        '--config', action='append', help='YAML file describing a metric.')
    PARSER.add_argument(
        '--project',
        default='k8s-gubernator',
        help='Charge the specified account for bigquery usage.')
    PARSER.add_argument(
        '--bucket',
        help='Upload results to the specified gcs bucket.')
    PARSER.add_argument(
        '--jq',
        help='Path to the jq binary.')

    ARGS = PARSER.parse_args()
    if ARGS.jq:
        # Rebind the module-level default so do_jq() picks up the override.
        DEFAULT_JQ_BIN = ARGS.jq
    main(ARGS.config, ARGS.project, ARGS.bucket)
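
# Example invocation (hypothetical key path, project, and bucket; flags as
# defined above):
#
#   GOOGLE_APPLICATION_CREDENTIALS=/path/to/key.json \
#       ./bigquery.py --config configs/my-metric.yaml \
#       --project my-gcp-project --bucket gs://my-bucket/metrics --jq /usr/bin/jq
#
# Omitting --config runs every YAML file under metrics/configs/; omitting
# --bucket runs the queries but skips the GCS upload.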