github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/kettle/make_db.py

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

    15  """Generates a SQLite DB containing test data downloaded from GCS."""
    16  
    17  from __future__ import print_function
    18  
    19  import argparse
    20  import logging
    21  import os
    22  import random
    23  import re
    24  import signal
    25  import sys
    26  import time
    27  import urllib2
    28  from xml.etree import cElementTree as ET
    29  
    30  import multiprocessing
    31  import multiprocessing.pool
    32  import requests
    33  import yaml
    34  
    35  import model
    36  
    37  
def pad_numbers(string):
    """Modify a string to make its numbers suitable for natural sorting."""
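    # e.g. 'build/123/started.json' -> 'build/0000000000000123/started.json',
    # so a lexicographic sort matches numeric order.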
    return re.sub(r'\d+', lambda m: m.group(0).rjust(16, '0'), string)

WORKER_CLIENT = None  # used for multiprocessing

class GCSClient(object):
    def __init__(self, jobs_dir, metadata=None):
        self.jobs_dir = jobs_dir
        self.metadata = metadata or {}
        self.session = requests.Session()

    def _request(self, path, params, as_json=True):
        """GETs a JSON resource from GCS, with retries on failure.

        Retries are based on guidance from
        cloud.google.com/storage/docs/gsutil/addlhelp/RetryHandlingStrategy

        """
        url = 'https://www.googleapis.com/storage/v1/b/%s' % path
        for retry in xrange(23):
            try:
                resp = self.session.get(url, params=params, stream=False)
                if 400 <= resp.status_code < 500 and resp.status_code != 429:
                    return None
                resp.raise_for_status()
                if as_json:
                    return resp.json()
                return resp.content
            except requests.exceptions.RequestException:
                logging.exception('request failed %s', url)
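            # Exponential backoff with random jitter, capped at 60 seconds per attempt.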
            time.sleep(random.random() * min(60, 2 ** retry))

    @staticmethod
    def _parse_uri(path):
        if not path.startswith('gs://'):
            raise ValueError("Bad GCS path")
        bucket, prefix = path[5:].split('/', 1)
        return bucket, prefix

    def get(self, path, as_json=False):
        """Get an object from GCS."""
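        # The object name must be fully URL-encoded (quote with safe=''), so '/' becomes %2F.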
        bucket, path = self._parse_uri(path)
        return self._request('%s/o/%s' % (bucket, urllib2.quote(path, '')),
                             {'alt': 'media'}, as_json=as_json)

    def ls(self, path, dirs=True, files=True, delim=True, item_field='name'):
        """Lists objects under a path on gcs."""
        # pylint: disable=invalid-name

        bucket, path = self._parse_uri(path)
        params = {'prefix': path, 'fields': 'nextPageToken'}
        if delim:
            params['delimiter'] = '/'
            if dirs:
                params['fields'] += ',prefixes'
        if files:
            params['fields'] += ',items(%s)' % item_field
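        # Page through results, following nextPageToken until it is absent.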
        while True:
            resp = self._request('%s/o' % bucket, params)
            if resp is None:  # nothing under path?
                return
            for prefix in resp.get('prefixes', []):
                yield 'gs://%s/%s' % (bucket, prefix)
            for item in resp.get('items', []):
                if item_field == 'name':
                    yield 'gs://%s/%s' % (bucket, item['name'])
                else:
                    yield item[item_field]
            if 'nextPageToken' not in resp:
                break
            params['pageToken'] = resp['nextPageToken']

    def ls_dirs(self, path):
        return self.ls(path, dirs=True, files=False)

    def _ls_junit_paths(self, build_dir):
        """Lists the paths of JUnit XML files for a build."""
        url = '%sartifacts/' % build_dir
        for path in self.ls(url):
            if re.match(r'.*/junit.*\.xml$', path):
                yield path

    def get_junits_from_build(self, build_dir):
        """Returns a dict mapping junit XML paths to their contents for a build."""
        files = {}
        assert not build_dir.endswith('/')
        for junit_path in self._ls_junit_paths(build_dir + '/'):
            files[junit_path] = self.get(junit_path)
        return files

    def _get_jobs(self):
        """Generates all jobs in the bucket."""
        for job_path in self.ls_dirs(self.jobs_dir):
            yield os.path.basename(os.path.dirname(job_path))

    def _get_builds(self, job):
        """Returns (precise, builds), where precise means the builds are guaranteed to exist."""
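        # Sequentially numbered jobs publish latest-build.txt; count down from it.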
        if self.metadata.get('sequential', True):
            try:
                latest_build = int(self.get('%s%s/latest-build.txt'
                                            % (self.jobs_dir, job)))
            except (ValueError, TypeError):
                pass
            else:
                return False, (str(n) for n in xrange(latest_build, 0, -1))
        # Invalid latest-build or bucket is using timestamps
        build_paths = self.ls_dirs('%s%s/' % (self.jobs_dir, job))
        return True, sorted(
            (os.path.basename(os.path.dirname(b)) for b in build_paths),
            key=pad_numbers, reverse=True)

    def get_started_finished(self, job, build):
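        # PR jobs store an indirection file mapping (job, build) to the real build directory.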
        if self.metadata.get('pr'):
            build_dir = self.get('%s/directory/%s/%s.txt' % (self.jobs_dir, job, build)).strip()
        else:
            build_dir = '%s%s/%s' % (self.jobs_dir, job, build)
        started = self.get('%s/started.json' % build_dir, as_json=True)
        finished = self.get('%s/finished.json' % build_dir, as_json=True)
        return build_dir, started, finished

    def get_builds(self, builds_have):
        """Generates all (job, build) pairs not already present in builds_have."""
        if self.metadata.get('pr'):
            files = self.ls(self.jobs_dir + '/directory/', delim=False)
            for fname in files:
                if fname.endswith('.txt') and 'latest-build' not in fname:
                    job, build = fname[:-4].split('/')[-2:]
                    if (job, build) in builds_have:
                        continue
                    yield job, build
            return
        for job in self._get_jobs():
            if job in ('pr-e2e-gce', 'maintenance-ci-testgrid-config-upload'):
                continue  # garbage.
            have = 0
            precise, builds = self._get_builds(job)
            for build in builds:
                if (job, build) in builds_have:
                    have += 1
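                    # In an imprecise (newest-first) listing, seeing 40 already-stored builds
                    # means the rest are almost certainly stored too; stop scanning this job.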
                    if have > 40 and not precise:
                        break
                    continue
                yield job, build


def mp_init_worker(jobs_dir, metadata, client_class, use_signal=True):
    """
    Initialize the environment for a multiprocessing worker.
    """

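    # Ignore SIGINT in workers so Ctrl-C is handled by the parent (which terminates the pool).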
    if use_signal:
        signal.signal(signal.SIGINT, signal.SIG_IGN)
    # Multiprocessing doesn't allow local variables for each worker, so we need
    # to make a GCSClient global variable.
    global WORKER_CLIENT  # pylint: disable=global-statement
    WORKER_CLIENT = client_class(jobs_dir, metadata)

def get_started_finished((job, build)):
    try:
        return WORKER_CLIENT.get_started_finished(job, build)
    except:
        logging.exception('failed to get tests for %s/%s', job, build)
        raise

def get_junits((build_id, gcs_path)):
    try:
        junits = WORKER_CLIENT.get_junits_from_build(gcs_path)
        return build_id, gcs_path, junits
    except:
        logging.exception('failed to get junits for %s', gcs_path)
        raise


def get_builds(db, jobs_dir, metadata, threads, client_class):
    """
    Adds started/finished build information from a GCS bucket to the database.

    Args:
        db: a model.Database to record builds in.
        jobs_dir: the GCS path containing jobs.
        metadata: a dict of metadata about the jobs_dir.
        threads: how many threads to use to download build information.
        client_class: a constructor for a GCSClient (or a subclass).
    """
    gcs = client_class(jobs_dir, metadata)

    print('Loading builds from %s' % jobs_dir)
    sys.stdout.flush()

    builds_have = db.get_existing_builds(jobs_dir)
    print('already have %d builds' % len(builds_have))
    sys.stdout.flush()

    jobs_and_builds = gcs.get_builds(builds_have)
    pool = None
    if threads > 1:
        pool = multiprocessing.Pool(threads, mp_init_worker,
                                    (jobs_dir, metadata, client_class))
        builds_iterator = pool.imap_unordered(
            get_started_finished, jobs_and_builds)
    else:
        global WORKER_CLIENT  # pylint: disable=global-statement
        WORKER_CLIENT = gcs
        builds_iterator = (
            get_started_finished(job_build) for job_build in jobs_and_builds)

    try:
        for n, (build_dir, started, finished) in enumerate(builds_iterator):
            print(build_dir)
            if started or finished:
                db.insert_build(build_dir, started, finished)
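            # Commit periodically so progress is preserved if the run is interrupted.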
            if n % 200 == 0:
                db.commit()
    except KeyboardInterrupt:
        if pool:
            pool.terminate()
        raise
    else:
        if pool:
            pool.close()
            pool.join()
    db.commit()


def remove_system_out(data):
    """Strip bloated system-out annotations."""
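    # system-out blocks can contain huge log dumps; dropping them keeps the DB small.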
    if 'system-out' in data:
        try:
            root = ET.fromstring(data)
            for parent in root.findall('*//system-out/..'):
                for child in parent.findall('system-out'):
                    parent.remove(child)
            return ET.tostring(root)
        except ET.ParseError:
            pass
    return data


def download_junit(db, threads, client_class):
    """Download junit results for builds without them."""
    print("Downloading JUnit artifacts.")
    sys.stdout.flush()
    builds_to_grab = db.get_builds_missing_junit()
    pool = None
    if threads > 1:
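        # ThreadPool workers run in this process, sharing the WORKER_CLIENT global.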
        pool = multiprocessing.pool.ThreadPool(
            threads, mp_init_worker, ('', {}, client_class, False))
        test_iterator = pool.imap_unordered(
            get_junits, builds_to_grab)
    else:
        global WORKER_CLIENT  # pylint: disable=global-statement
        WORKER_CLIENT = client_class('', {})
        test_iterator = (
            get_junits(build_path) for build_path in builds_to_grab)
    for n, (build_id, build_path, junits) in enumerate(test_iterator, 1):
        print('%d/%d' % (n, len(builds_to_grab)),
              build_path, len(junits), len(''.join(junits.values())))
        junits = {k: remove_system_out(v) for k, v in junits.iteritems()}

        db.insert_build_junits(build_id, junits)
        if n % 100 == 0:
            db.commit()
    db.commit()
    if pool:
        pool.close()
        pool.join()


def main(db, jobs_dirs, threads, get_junit, client_class=GCSClient):
    """Collect test info in matching jobs."""
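    # PR logs live in a fixed bucket and are always scanned, in addition to the configured buckets.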
    get_builds(db, 'gs://kubernetes-jenkins/pr-logs', {'pr': True},
               threads, client_class)
    for bucket, metadata in jobs_dirs.iteritems():
        if not bucket.endswith('/'):
            bucket += '/'
        get_builds(db, bucket, metadata, threads, client_class)
    if get_junit:
        download_junit(db, threads, client_class)


def get_options(argv):
    """Process command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--buckets',
        help='YAML file with GCS bucket locations',
        required=True,
    )
    parser.add_argument(
        '--threads',
        help='number of concurrent threads to download results with',
        default=32,
        type=int,
    )
    parser.add_argument(
        '--junit',
        action='store_true',
        help='Download JUnit results from each build'
    )
    return parser.parse_args(argv)


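# Typical invocation (file names are illustrative):
#   python make_db.py --buckets buckets.yaml --threads 32 --junit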
if __name__ == '__main__':
    OPTIONS = get_options(sys.argv[1:])
    main(model.Database(),
         yaml.load(open(OPTIONS.buckets)),
         OPTIONS.threads,
         OPTIONS.junit)