github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/kettle/stream.py (about) 1 #!/usr/bin/env python 2 # Copyright 2017 The Kubernetes Authors. 3 # 4 # Licensed under the Apache License, Version 2.0 (the "License"); 5 # you may not use this file except in compliance with the License. 6 # You may obtain a copy of the License at 7 # 8 # http://www.apache.org/licenses/LICENSE-2.0 9 # 10 # Unless required by applicable law or agreed to in writing, software 11 # distributed under the License is distributed on an "AS IS" BASIS, 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 # See the License for the specific language governing permissions and 14 # limitations under the License. 15 16 """Receive push events for new builds and upload rows to BigQuery.""" 17 18 from __future__ import print_function 19 20 import argparse 21 import json 22 import os 23 import pprint 24 import socket 25 import sys 26 import traceback 27 import time 28 29 import multiprocessing.pool 30 31 try: 32 from google.cloud import bigquery 33 from google.cloud import pubsub 34 import google.cloud.exceptions 35 except ImportError: 36 print('WARNING: unable to load google cloud (test environment?)') 37 traceback.print_exc() 38 39 import model 40 import make_db 41 import make_json 42 43 44 def process_changes(results): 45 """Split GCS change events into trivial acks and builds to further process.""" 46 acks = [] # pubsub message ids to acknowledge 47 todo = [] # (id, job, build) of builds to grab 48 49 # process results, find finished builds to process 50 for ack_id, message in results: 51 if message.attributes['eventType'] != 'OBJECT_FINALIZE': 52 acks.append(ack_id) 53 continue 54 obj = message.attributes['objectId'] 55 if not obj.endswith('/finished.json'): 56 acks.append(ack_id) 57 continue 58 job, build = obj[:-len('/finished.json')].rsplit('/', 1) 59 job = 'gs://%s/%s' % (message.attributes['bucketId'], job) 60 todo.append((ack_id, job, build)) 61 62 return acks, todo 63 64 65 def get_started_finished(gcs_client, db, todo): 66 """Download started/finished.json from build dirs in todo.""" 67 acks = [] 68 build_dirs = [] 69 pool = multiprocessing.pool.ThreadPool(16) 70 try: 71 for ack_id, (build_dir, started, finished) in pool.imap_unordered( 72 lambda (ack_id, job, build): (ack_id, gcs_client.get_started_finished(job, build)), 73 todo): 74 if finished: 75 if not db.insert_build(build_dir, started, finished): 76 print('already present??') 77 start = time.localtime(started.get('timestamp', 0) if started else 0) 78 print(build_dir, bool(started), bool(finished), 79 time.strftime('%F %T %Z', start), 80 finished and finished.get('result')) 81 build_dirs.append(build_dir) 82 acks.append(ack_id) 83 else: 84 print('finished.json missing?', build_dir, started, finished) 85 finally: 86 pool.close() 87 db.commit() 88 return acks, build_dirs 89 90 91 def row_to_mapping(row, schema): 92 """Convert a dictionary to a list for bigquery.Table.insert_data. 93 94 Silly. See https://github.com/GoogleCloudPlatform/google-cloud-python/issues/3396 95 """ 96 return [row.get(field.name, [] if field.mode == 'REPEATED' else None) for field in schema] 97 98 99 def retry(func, *args, **kwargs): 100 """Run a function with arguments, retrying on server errors. """ 101 # pylint: disable=no-member 102 for attempt in xrange(20): 103 try: 104 return func(*args, **kwargs) 105 except (socket.error, google.cloud.exceptions.ServerError): 106 # retry with exponential backoff 107 traceback.print_exc() 108 time.sleep(1.4 ** attempt) 109 return func(*args, **kwargs) # one last attempt 110 111 112 def insert_data(table, rows_iter): 113 """Upload rows from rows_iter into bigquery table table. 114 115 rows_iter should return a series of (row_id, row dictionary) tuples. 116 The row dictionary must match the table's schema. 117 118 Returns the row_ids that were inserted. 119 """ 120 emitted = set() 121 122 rows = [] 123 row_ids = [] 124 125 for row_id, row in rows_iter: 126 emitted.add(row_id) 127 if len(json.dumps(row)) > 1e6: 128 print('ERROR: row too long', row['path']) 129 continue 130 row = row_to_mapping(row, table.schema) 131 rows.append(row) 132 row_ids.append(row_id) 133 134 if not rows: # nothing to do 135 return [] 136 137 def insert(table, rows, row_ids): 138 """Insert rows with row_ids into table, retrying as necessary.""" 139 errors = retry(table.insert_data, rows, row_ids, skip_invalid_rows=True) 140 141 if not errors: 142 print('Loaded {} builds into {}'.format(len(rows), table.name)) 143 else: 144 print('Errors:') 145 pprint.pprint(errors) 146 pprint.pprint(table.schema) 147 148 if len(json.dumps(rows)) > 10e6: 149 print('WARNING: too big for one insert, doing stupid slow version') 150 for row, row_id in zip(rows, row_ids): 151 insert(table, [row], [row_id]) 152 else: 153 insert(table, rows, row_ids) 154 155 return emitted 156 157 158 def main(db, sub, tables, client_class=make_db.GCSClient, stop=None): 159 # pylint: disable=too-many-locals 160 gcs_client = client_class('', {}) 161 162 if stop is None: 163 stop = lambda: False 164 165 results = [0] * 1000 # don't sleep on first loop 166 while not stop(): 167 print() 168 if len(results) < 10 and client_class is make_db.GCSClient: 169 time.sleep(5) # slow down! 170 171 print('====', time.strftime("%F %T %Z"), '=' * 40) 172 173 results = retry(sub.pull, max_messages=1000) 174 start = time.time() 175 while time.time() < start + 7: 176 results_more = sub.pull(max_messages=1000, return_immediately=True) 177 if not results_more: 178 break 179 results += results_more 180 181 print('PULLED', len(results)) 182 183 acks, todo = process_changes(results) 184 185 if acks: 186 print('ACK irrelevant', len(acks)) 187 for n in xrange(0, len(acks), 1000): 188 retry(sub.acknowledge, acks[n: n + 1000]) 189 190 if todo: 191 print('EXTEND-ACK ', len(todo)) 192 # give 3 minutes to grab build details 193 retry(sub.modify_ack_deadline, [i for i, _j, _b in todo], 60*3) 194 195 acks, build_dirs = get_started_finished(gcs_client, db, todo) 196 197 # notify pubsub queue that we've handled the finished.json messages 198 if acks: 199 print('ACK "finished.json"', len(acks)) 200 retry(sub.acknowledge, acks) 201 202 # grab junit files for new builds 203 make_db.download_junit(db, 16, client_class) 204 205 # stream new rows to tables 206 if build_dirs and tables: 207 for table, incremental_table in tables.itervalues(): 208 builds = db.get_builds_from_paths(build_dirs, incremental_table) 209 emitted = insert_data(table, make_json.make_rows(db, builds)) 210 db.insert_emitted(emitted, incremental_table) 211 212 213 def load_sub(poll): 214 """Return the PubSub subscription specified by the /-separated input.""" 215 project, topic, subscription = poll.split('/') 216 pubsub_client = pubsub.Client(project) 217 return pubsub_client.topic(topic).subscription(subscription) 218 219 220 def load_schema(schemafield): 221 """Construct the expected BigQuery schema from files on disk. 222 223 Only used for new tables.""" 224 basedir = os.path.dirname(__file__) 225 schema_json = json.load(open(os.path.join(basedir, 'schema.json'))) 226 def make_field(spec): 227 spec['field_type'] = spec.pop('type') 228 if 'fields' in spec: 229 spec['fields'] = [make_field(f) for f in spec['fields']] 230 return schemafield(**spec) 231 return [make_field(s) for s in schema_json] 232 233 234 def load_tables(dataset, tablespecs): 235 """Construct a dictionary of BigQuery tables given the input tablespec. 236 237 Args: 238 dataset: bigquery.Dataset 239 tablespecs: list of strings of "NAME:DAYS", e.g. ["day:1"] 240 Returns: 241 {name: (bigquery.Table, incremental table name)} 242 """ 243 project, dataset_name = dataset.split(':') 244 dataset = bigquery.Client(project).dataset(dataset_name) 245 246 tables = {} 247 for spec in tablespecs: 248 name, days = spec.split(':') 249 table = dataset.table(name) 250 try: 251 table.reload() 252 except google.cloud.exceptions.NotFound: # pylint: disable=no-member 253 table.schema = load_schema(bigquery.schema.SchemaField) 254 table.create() 255 tables[name] = (table, make_json.get_table(float(days))) 256 return tables 257 258 259 class StopWhen(object): 260 """A simple object that returns True once when the given hour begins.""" 261 def __init__(self, target, clock=lambda: time.localtime().tm_hour): 262 self.clock = clock 263 self.last = self.clock() 264 self.target = target 265 266 def __call__(self): 267 if os.path.exists('stop'): 268 return True 269 now = self.clock() 270 last = self.last 271 self.last = now 272 return now != last and now == self.target 273 274 275 def get_options(argv): 276 """Process command line arguments.""" 277 parser = argparse.ArgumentParser() 278 parser.add_argument( 279 '--poll', 280 required=True, 281 help='Follow GCS changes from project/topic/subscription', 282 ) 283 parser.add_argument( 284 '--dataset', 285 help='BigQuery dataset (e.g. k8s-gubernator:build)' 286 ) 287 parser.add_argument( 288 '--tables', 289 nargs='+', 290 default=[], 291 help='Upload rows to table:days [e.g. --tables day:1 week:7 all:0]', 292 ) 293 parser.add_argument( 294 '--stop_at', 295 type=int, 296 help='Terminate when this hour (0-23) rolls around (in local time).' 297 ) 298 return parser.parse_args(argv) 299 300 301 if __name__ == '__main__': 302 OPTIONS = get_options(sys.argv[1:]) 303 304 main(model.Database(), 305 load_sub(OPTIONS.poll), 306 load_tables(OPTIONS.dataset, OPTIONS.tables), 307 stop=StopWhen(OPTIONS.stop_at))