github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/kettle/make_db.py (about) 1 # Copyright 2017 The Kubernetes Authors. 2 # 3 # Licensed under the Apache License, Version 2.0 (the "License"); 4 # you may not use this file except in compliance with the License. 5 # You may obtain a copy of the License at 6 # 7 # http://www.apache.org/licenses/LICENSE-2.0 8 # 9 # Unless required by applicable law or agreed to in writing, software 10 # distributed under the License is distributed on an "AS IS" BASIS, 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 # See the License for the specific language governing permissions and 13 # limitations under the License. 14 15 """Generates a SQLite DB containing test data downloaded from GCS.""" 16 17 from __future__ import print_function 18 19 import argparse 20 import logging 21 import os 22 import random 23 import re 24 import signal 25 import sys 26 import time 27 import urllib2 28 from xml.etree import cElementTree as ET 29 30 import multiprocessing 31 import multiprocessing.pool 32 import requests 33 import yaml 34 35 import model 36 37 38 def pad_numbers(string): 39 """Modify a string to make its numbers suitable for natural sorting.""" 40 return re.sub(r'\d+', lambda m: m.group(0).rjust(16, '0'), string) 41 42 WORKER_CLIENT = None # used for multiprocessing 43 44 class GCSClient(object): 45 def __init__(self, jobs_dir, metadata=None): 46 self.jobs_dir = jobs_dir 47 self.metadata = metadata or {} 48 self.session = requests.Session() 49 50 def _request(self, path, params, as_json=True): 51 """GETs a JSON resource from GCS, with retries on failure. 52 53 Retries are based on guidance from 54 cloud.google.com/storage/docs/gsutil/addlhelp/RetryHandlingStrategy 55 56 """ 57 url = 'https://www.googleapis.com/storage/v1/b/%s' % path 58 for retry in xrange(23): 59 try: 60 resp = self.session.get(url, params=params, stream=False) 61 if 400 <= resp.status_code < 500 and resp.status_code != 429: 62 return None 63 resp.raise_for_status() 64 if as_json: 65 return resp.json() 66 return resp.content 67 except requests.exceptions.RequestException: 68 logging.exception('request failed %s', url) 69 time.sleep(random.random() * min(60, 2 ** retry)) 70 71 @staticmethod 72 def _parse_uri(path): 73 if not path.startswith('gs://'): 74 raise ValueError("Bad GCS path") 75 bucket, prefix = path[5:].split('/', 1) 76 return bucket, prefix 77 78 def get(self, path, as_json=False): 79 """Get an object from GCS.""" 80 bucket, path = self._parse_uri(path) 81 return self._request('%s/o/%s' % (bucket, urllib2.quote(path, '')), 82 {'alt': 'media'}, as_json=as_json) 83 84 def ls(self, path, dirs=True, files=True, delim=True, item_field='name'): 85 """Lists objects under a path on gcs.""" 86 # pylint: disable=invalid-name 87 88 bucket, path = self._parse_uri(path) 89 params = {'prefix': path, 'fields': 'nextPageToken'} 90 if delim: 91 params['delimiter'] = '/' 92 if dirs: 93 params['fields'] += ',prefixes' 94 if files: 95 params['fields'] += ',items(%s)' % item_field 96 while True: 97 resp = self._request('%s/o' % bucket, params) 98 if resp is None: # nothing under path? 99 return 100 for prefix in resp.get('prefixes', []): 101 yield 'gs://%s/%s' % (bucket, prefix) 102 for item in resp.get('items', []): 103 if item_field == 'name': 104 yield 'gs://%s/%s' % (bucket, item['name']) 105 else: 106 yield item[item_field] 107 if 'nextPageToken' not in resp: 108 break 109 params['pageToken'] = resp['nextPageToken'] 110 111 def ls_dirs(self, path): 112 return self.ls(path, dirs=True, files=False) 113 114 def _ls_junit_paths(self, build_dir): 115 """Lists the paths of JUnit XML files for a build.""" 116 url = '%sartifacts/' % (build_dir) 117 for path in self.ls(url): 118 if re.match(r'.*/junit.*\.xml$', path): 119 yield path 120 121 def get_junits_from_build(self, build_dir): 122 """Generates all tests for a build.""" 123 files = {} 124 assert not build_dir.endswith('/') 125 for junit_path in self._ls_junit_paths(build_dir + '/'): 126 files[junit_path] = self.get(junit_path) 127 return files 128 129 def _get_jobs(self): 130 """Generates all jobs in the bucket.""" 131 for job_path in self.ls_dirs(self.jobs_dir): 132 yield os.path.basename(os.path.dirname(job_path)) 133 134 def _get_builds(self, job): 135 '''Returns whether builds are precise (guarantees existence)''' 136 if self.metadata.get('sequential', True): 137 try: 138 latest_build = int(self.get('%s%s/latest-build.txt' 139 % (self.jobs_dir, job))) 140 except (ValueError, TypeError): 141 pass 142 else: 143 return False, (str(n) for n in xrange(latest_build, 0, -1)) 144 # Invalid latest-build or bucket is using timestamps 145 build_paths = self.ls_dirs('%s%s/' % (self.jobs_dir, job)) 146 return True, sorted( 147 (os.path.basename(os.path.dirname(b)) for b in build_paths), 148 key=pad_numbers, reverse=True) 149 150 def get_started_finished(self, job, build): 151 if self.metadata.get('pr'): 152 build_dir = self.get('%s/directory/%s/%s.txt' % (self.jobs_dir, job, build)).strip() 153 else: 154 build_dir = '%s%s/%s' % (self.jobs_dir, job, build) 155 started = self.get('%s/started.json' % build_dir, as_json=True) 156 finished = self.get('%s/finished.json' % build_dir, as_json=True) 157 return build_dir, started, finished 158 159 def get_builds(self, builds_have): 160 """Generates all (job, build) pairs ever.""" 161 if self.metadata.get('pr'): 162 files = self.ls(self.jobs_dir + '/directory/', delim=False) 163 for fname in files: 164 if fname.endswith('.txt') and 'latest-build' not in fname: 165 job, build = fname[:-4].split('/')[-2:] 166 if (job, build) in builds_have: 167 continue 168 yield job, build 169 return 170 for job in self._get_jobs(): 171 if job in ('pr-e2e-gce', 'maintenance-ci-testgrid-config-upload'): 172 continue # garbage. 173 have = 0 174 precise, builds = self._get_builds(job) 175 for build in builds: 176 if (job, build) in builds_have: 177 have += 1 178 if have > 40 and not precise: 179 break 180 continue 181 yield job, build 182 183 184 def mp_init_worker(jobs_dir, metadata, client_class, use_signal=True): 185 """ 186 Initialize the environment for multiprocessing-based multithreading. 187 """ 188 189 if use_signal: 190 signal.signal(signal.SIGINT, signal.SIG_IGN) 191 # Multiprocessing doesn't allow local variables for each worker, so we need 192 # to make a GCSClient global variable. 193 global WORKER_CLIENT # pylint: disable=global-statement 194 WORKER_CLIENT = client_class(jobs_dir, metadata) 195 196 def get_started_finished((job, build)): 197 try: 198 return WORKER_CLIENT.get_started_finished(job, build) 199 except: 200 logging.exception('failed to get tests for %s/%s', job, build) 201 raise 202 203 def get_junits((build_id, gcs_path)): 204 try: 205 junits = WORKER_CLIENT.get_junits_from_build(gcs_path) 206 return build_id, gcs_path, junits 207 except: 208 logging.exception('failed to get junits for %s', gcs_path) 209 raise 210 211 212 def get_builds(db, jobs_dir, metadata, threads, client_class): 213 """ 214 Adds information about tests to a dictionary. 215 216 Args: 217 jobs_dir: the GCS path containing jobs. 218 metadata: a dict of metadata about the jobs_dir. 219 threads: how many threads to use to download build information. 220 client_class: a constructor for a GCSClient (or a subclass). 221 """ 222 gcs = client_class(jobs_dir, metadata) 223 224 print('Loading builds from %s' % jobs_dir) 225 sys.stdout.flush() 226 227 builds_have = db.get_existing_builds(jobs_dir) 228 print('already have %d builds' % len(builds_have)) 229 sys.stdout.flush() 230 231 jobs_and_builds = gcs.get_builds(builds_have) 232 pool = None 233 if threads > 1: 234 pool = multiprocessing.Pool(threads, mp_init_worker, 235 (jobs_dir, metadata, client_class)) 236 builds_iterator = pool.imap_unordered( 237 get_started_finished, jobs_and_builds) 238 else: 239 global WORKER_CLIENT # pylint: disable=global-statement 240 WORKER_CLIENT = gcs 241 builds_iterator = ( 242 get_started_finished(job_build) for job_build in jobs_and_builds) 243 244 try: 245 for n, (build_dir, started, finished) in enumerate(builds_iterator): 246 print(build_dir) 247 if started or finished: 248 db.insert_build(build_dir, started, finished) 249 if n % 200 == 0: 250 db.commit() 251 except KeyboardInterrupt: 252 if pool: 253 pool.terminate() 254 raise 255 else: 256 if pool: 257 pool.close() 258 pool.join() 259 db.commit() 260 261 262 def remove_system_out(data): 263 """Strip bloated system-out annotations.""" 264 if 'system-out' in data: 265 try: 266 root = ET.fromstring(data) 267 for parent in root.findall('*//system-out/..'): 268 for child in parent.findall('system-out'): 269 parent.remove(child) 270 return ET.tostring(root) 271 except ET.ParseError: 272 pass 273 return data 274 275 276 def download_junit(db, threads, client_class): 277 """Download junit results for builds without them.""" 278 print("Downloading JUnit artifacts.") 279 sys.stdout.flush() 280 builds_to_grab = db.get_builds_missing_junit() 281 pool = None 282 if threads > 1: 283 pool = multiprocessing.pool.ThreadPool( 284 threads, mp_init_worker, ('', {}, client_class, False)) 285 test_iterator = pool.imap_unordered( 286 get_junits, builds_to_grab) 287 else: 288 global WORKER_CLIENT # pylint: disable=global-statement 289 WORKER_CLIENT = client_class('', {}) 290 test_iterator = ( 291 get_junits(build_path) for build_path in builds_to_grab) 292 for n, (build_id, build_path, junits) in enumerate(test_iterator, 1): 293 print('%d/%d' % (n, len(builds_to_grab)), 294 build_path, len(junits), len(''.join(junits.values()))) 295 junits = {k: remove_system_out(v) for k, v in junits.iteritems()} 296 297 db.insert_build_junits(build_id, junits) 298 if n % 100 == 0: 299 db.commit() 300 db.commit() 301 if pool: 302 pool.close() 303 pool.join() 304 305 306 def main(db, jobs_dirs, threads, get_junit, client_class=GCSClient): 307 """Collect test info in matching jobs.""" 308 get_builds(db, 'gs://kubernetes-jenkins/pr-logs', {'pr': True}, 309 threads, client_class) 310 for bucket, metadata in jobs_dirs.iteritems(): 311 if not bucket.endswith('/'): 312 bucket += '/' 313 get_builds(db, bucket, metadata, threads, client_class) 314 if get_junit: 315 download_junit(db, threads, client_class) 316 317 318 def get_options(argv): 319 """Process command line arguments.""" 320 parser = argparse.ArgumentParser() 321 parser.add_argument( 322 '--buckets', 323 help='YAML file with GCS bucket locations', 324 required=True, 325 ) 326 parser.add_argument( 327 '--threads', 328 help='number of concurrent threads to download results with', 329 default=32, 330 type=int, 331 ) 332 parser.add_argument( 333 '--junit', 334 action='store_true', 335 help='Download JUnit results from each build' 336 ) 337 return parser.parse_args(argv) 338 339 340 if __name__ == '__main__': 341 OPTIONS = get_options(sys.argv[1:]) 342 main(model.Database(), 343 yaml.load(open(OPTIONS.buckets)), 344 OPTIONS.threads, 345 OPTIONS.junit)