k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/kettle/stream.py

#!/usr/bin/env python3
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Receive push events for new builds and upload rows to BigQuery."""


import argparse
import json
import os
import pprint
import socket
import sys
import traceback
import time

import multiprocessing.pool
import ruamel.yaml as yaml

try:
    from google.api_core import exceptions as api_exceptions
    from google.cloud import bigquery
    from google.cloud import pubsub_v1
    import google.cloud.exceptions
except ImportError:
    print('WARNING: unable to load google cloud (test environment?)')
    traceback.print_exc()

import model
import make_db
import make_json


MAX_ROW_UPLOAD = 10  # See https://github.com/googleapis/google-cloud-go/issues/2855


def should_exclude(object_id, bucket_id, buckets):
    # Objects are of the form a/b/c/<jobname>/<hash>/<objectFile>
    if bucket_id not in buckets:
        return False
    return any(f'/{job}/' in object_id for job in buckets[bucket_id].get('exclude_jobs', []))


def process_changes(results, buckets):
    """Split GCS change events into trivial ack_ids and builds to further process."""
    ack_ids = []  # pubsub rec_message ids to acknowledge
    todo = []  # (ack_id, job, build) of builds to grab
    # Process the results, finding finished builds to process.
    for rec_message in results:
        object_id = rec_message.message.attributes['objectId']
        bucket_id = rec_message.message.attributes['bucketId']
        exclude = should_exclude(object_id, bucket_id, buckets)
        if not object_id.endswith('/finished.json') or exclude:
            ack_ids.append(rec_message.ack_id)
            continue
        job, build = object_id[:-len('/finished.json')].rsplit('/', 1)
        job = f'gs://{bucket_id}/{job}'
        todo.append((rec_message.ack_id, job, build))
    return ack_ids, todo


def get_started_finished(gcs_client, db, todo):
    """Download started.json and finished.json from the build dirs in todo."""
    ack_ids = []
    build_dirs = []
    pool = multiprocessing.pool.ThreadPool(16)
    try:
        for ack_id, (build_dir, started, finished) in pool.imap_unordered(
                lambda ack_id_job_build: (ack_id_job_build[0], gcs_client.get_started_finished(
                    ack_id_job_build[1], ack_id_job_build[2])),
                todo):
            if finished:
                if not db.insert_build(build_dir, started, finished):
                    print('build dir already present in db: ', build_dir)
                start = time.localtime(started.get('timestamp', 0) if started else 0)
                print((build_dir, bool(started), bool(finished),
                       time.strftime('%F %T %Z', start),
                       finished and finished.get('result')))
                build_dirs.append(build_dir)
                ack_ids.append(ack_id)
            else:
                print('finished.json missing?', build_dir, started, finished)
    finally:
        pool.close()
    db.commit()
    return ack_ids, build_dirs


def retry(func, *args, **kwargs):
    """Run a function with arguments, retrying on server errors."""
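    # Worst case, the backoff sleeps below sum to (1.4**20 - 1) / 0.4, roughly
    # 2,089 seconds (~35 minutes), before the final unguarded attempt.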
    # pylint: disable=no-member
    for attempt in range(20):
        try:
            return func(*args, **kwargs)
        except (socket.error, google.cloud.exceptions.ServerError):
            # Retry with exponential backoff.
            traceback.print_exc()
            time.sleep(1.4 ** attempt)
        except api_exceptions.BadRequest as err:
            args_size = sys.getsizeof(args)
            kwargs_str = ','.join(f'{k}={v}' for k, v in kwargs.items())
            print(f"Error running {func.__name__} "
                  f"([bytes in args]{args_size} with {kwargs_str}): {str(err).encode('utf8')}")
            return None  # skip this call rather than retry a bad request
    return func(*args, **kwargs)  # one last attempt


def insert_data(bq_client, table, rows_iter):
    """Upload rows from rows_iter into the BigQuery table `table`.

    rows_iter should yield a series of (row_id, row dictionary) tuples.
    The row dictionary must match the table's schema.

    Args:
        bq_client: Client connection to BigQuery
        table: bigquery.Table object that points to a specific table
        rows_iter: iterable of (row_id, dict) pairs representing make_json.Build rows
    Returns:
        The row_ids that were uploaded.
    """
    def divide_chunks(l, bin_size=MAX_ROW_UPLOAD):
        # Break the rows into chunks so a single request stays under the data limits.
        for i in range(0, len(l), bin_size):
            yield l[i:i + bin_size]

    emitted, rows = [], []

    for row_id, build in rows_iter:
        emitted.append(row_id)
        rows.append(build)

    if not rows:  # nothing to do
        return []

    for chunk in divide_chunks(rows):
        # Insert rows with row_ids into the table, retrying as necessary.
        errors = retry(bq_client.insert_rows, table, chunk, skip_invalid_rows=True)
        if not errors:
            print(f'Loaded {len(chunk)} builds into {table.full_table_id}')
        else:
            print(f'Errors on chunk: {chunk}')
            pprint.pprint(errors)
            pprint.pprint(table.schema)

    return emitted


def main(
        db,
        subscriber,
        subscription_path,
        bq_client,
        tables,
        buckets,
        client_class=make_db.GCSClient,
        stop=None,
):
    # pylint: disable=too-many-locals
    gcs_client = client_class('', {})
    if stop is None:
        stop = lambda: False

    results = [0] * 1000  # don't sleep on the first loop
    while not stop():
        print()
        if len(results) < 10 and client_class is make_db.GCSClient:
            time.sleep(5)  # slow down!
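
        # Drain the subscription each iteration: one blocking pull below, then
        # repeated non-blocking pulls for up to 7 seconds until nothing remains.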
        print('====', time.strftime("%F %T %Z"), '=' * 40)

        results = retry(subscriber.pull, subscription=subscription_path, max_messages=1000)
        results = list(results.received_messages)
        start = time.time()
        while time.time() < start + 7:
            results_more = list(subscriber.pull(
                subscription=subscription_path,
                max_messages=1000,
                return_immediately=True).received_messages)
            if not results_more:
                break
            results.extend(results_more)

        print('PULLED', len(results))

        ack_ids, todo = process_changes(results, buckets)

        if ack_ids:
            print('ACK irrelevant', len(ack_ids))
            for n in range(0, len(ack_ids), 1000):
                retry(
                    subscriber.acknowledge,
                    subscription=subscription_path,
                    ack_ids=ack_ids[n: n + 1000])

        if todo:
            print('EXTEND-ACK ', len(todo))
            # Give ourselves 3 minutes to grab the build details.
            retry(
                subscriber.modify_ack_deadline,
                subscription=subscription_path,
                ack_ids=[i for i, _j, _b in todo],
                ack_deadline_seconds=60 * 3)

        ack_ids, build_dirs = get_started_finished(gcs_client, db, todo)

        # Notify the pubsub queue that we've handled the finished.json messages.
        if ack_ids:
            print('ACK "finished.json"', len(ack_ids))
            retry(subscriber.acknowledge, subscription=subscription_path, ack_ids=ack_ids)

        # Grab junit files for new builds.
        make_db.download_junit(db, 16, client_class)

        # Stream new rows into the tables.
        if build_dirs and tables:
            for table, incremental_table in tables.values():
                builds = db.get_builds_from_paths(build_dirs, incremental_table)
                emitted = insert_data(bq_client, table, make_json.make_rows(db, builds))
                db.insert_emitted(emitted, incremental_table)


def load_sub(poll):
    """Return the PubSub subscription specified by the /-separated input.

    Args:
        poll: Follow GCS changes from project/topic/subscription.
            Ex: kubernetes-jenkins/gcs-changes/kettle

    Returns:
        A (subscriber client, subscription path) tuple.
    """
    subscriber = pubsub_v1.SubscriberClient()
    project_id, _, sub = poll.split('/')
    subscription_path = f'projects/{project_id}/subscriptions/{sub}'
    return subscriber, subscription_path


def load_schema(schemafield):
    """Construct the expected BigQuery schema from the schema file on disk.

    Only used for new tables.
    """
    basedir = os.path.dirname(__file__)
    with open(os.path.join(basedir, 'schema.json')) as json_file:
        schema_json = json.load(json_file)

    def make_field(spec):
        spec['field_type'] = spec.pop('type')
        if 'fields' in spec:
            spec['fields'] = [make_field(f) for f in spec['fields']]
        return schemafield(**spec)

    return [make_field(s) for s in schema_json]


def load_tables(dataset, tablespecs):
    """Construct a dictionary of BigQuery tables given the input tablespecs.

    Args:
        dataset: BigQuery dataset of the form "PROJECT:DATASET_NAME"
        tablespecs: list of strings of "NAME:DAYS",
["day:1"] 270 Returns: 271 client, {name: (bigquery.Table, incremental table name)} 272 """ 273 project, dataset_name = dataset.split(':') 274 bq_client = bigquery.Client(project) 275 276 tables = {} 277 for spec in tablespecs: 278 table_name, days = spec.split(':') 279 table_ref = f'{project}.{dataset_name}.{table_name}' 280 try: 281 table = bq_client.get_table(table_ref) # pylint: disable=no-member 282 except google.cloud.exceptions.NotFound: 283 table = bq_client.create_table(table_ref) # pylint: disable=no-member 284 table.schema = load_schema(bigquery.schema.SchemaField) 285 tables[table_name] = (table, make_json.get_table(float(days))) 286 return bq_client, tables 287 288 289 class StopWhen: 290 """A simple object that returns True once when the given hour begins.""" 291 def __init__(self, target, clock=lambda: time.localtime().tm_hour): 292 self.clock = clock 293 self.last = self.clock() 294 self.target = target 295 296 def __call__(self): 297 if os.path.exists('stop'): 298 return True 299 now = self.clock() 300 last = self.last 301 self.last = now 302 return now != last and now == self.target 303 304 305 def _make_bucket_map(path): 306 bucket_map = yaml.safe_load(open(path)) 307 bucket_to_attrs = dict() 308 for k, v in bucket_map.items(): 309 bucket = k.rsplit('/')[2] # of form gs://<bucket>/... 310 bucket_to_attrs[bucket] = v 311 return bucket_to_attrs 312 313 def get_options(argv): 314 """Process command line arguments.""" 315 parser = argparse.ArgumentParser() 316 parser.add_argument( 317 '--poll', 318 required=True, 319 help='Follow GCS changes from project/topic/subscription', 320 ) 321 parser.add_argument( 322 '--dataset', 323 help='BigQuery dataset (e.g. kubernetes-public:k8s_infra_kettle)' 324 ) 325 parser.add_argument( 326 '--tables', 327 nargs='+', 328 default=[], 329 help='Upload rows to table:days [e.g. --tables day:1 week:7 all:0]', 330 ) 331 parser.add_argument( 332 '--stop_at', 333 type=int, 334 help='Terminate when this hour (0-23) rolls around (in local time).' 335 ) 336 parser.add_argument( 337 '--buckets', 338 type=str, 339 default='buckets.yaml', 340 help='Path to bucket configuration.' 341 ) 342 return parser.parse_args(argv) 343 344 345 if __name__ == '__main__': 346 OPTIONS = get_options(sys.argv[1:]) 347 main(model.Database(), 348 *load_sub(OPTIONS.poll), 349 *load_tables(OPTIONS.dataset, OPTIONS.tables), 350 _make_bucket_map(OPTIONS.buckets), 351 stop=StopWhen(OPTIONS.stop_at))