github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/kettle/stream.py

#!/usr/bin/env python
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Receive push events for new builds and upload rows to BigQuery."""

from __future__ import print_function

import argparse
import json
import os
import pprint
import socket
import sys
import traceback
import time

import multiprocessing.pool

try:
    from google.cloud import bigquery
    from google.cloud import pubsub
    import google.cloud.exceptions
except ImportError:
    print('WARNING: unable to load google cloud (test environment?)')
    traceback.print_exc()

import model
import make_db
import make_json


def process_changes(results):
    """Split GCS change events into trivial acks and builds to further process."""
    acks = []  # pubsub message ids to acknowledge
    todo = []  # (id, job, build) of builds to grab

    # process results, find finished builds to process
    for ack_id, message in results:
        if message.attributes['eventType'] != 'OBJECT_FINALIZE':
            acks.append(ack_id)
            continue
        obj = message.attributes['objectId']
        if not obj.endswith('/finished.json'):
            acks.append(ack_id)
            continue
        job, build = obj[:-len('/finished.json')].rsplit('/', 1)
        job = 'gs://%s/%s' % (message.attributes['bucketId'], job)
        todo.append((ack_id, job, build))

    return acks, todo
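# Illustrative example of the mapping above (bucket and path are made up):
# a notification with attributes {'eventType': 'OBJECT_FINALIZE',
# 'bucketId': 'kubernetes-jenkins', 'objectId': 'logs/ci-foo/1234/finished.json'}
# becomes the todo entry (ack_id, 'gs://kubernetes-jenkins/logs/ci-foo', '1234');
# every other event is acked immediately and dropped.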


def get_started_finished(gcs_client, db, todo):
    """Download started/finished.json from build dirs in todo."""
    acks = []
    build_dirs = []
    pool = multiprocessing.pool.ThreadPool(16)
    try:
        for ack_id, (build_dir, started, finished) in pool.imap_unordered(
                lambda (ack_id, job, build): (ack_id, gcs_client.get_started_finished(job, build)),
                todo):
            if finished:
                if not db.insert_build(build_dir, started, finished):
                    print('already present??')
                start = time.localtime(started.get('timestamp', 0) if started else 0)
                print(build_dir, bool(started), bool(finished),
                      time.strftime('%F %T %Z', start),
                      finished and finished.get('result'))
                build_dirs.append(build_dir)
                acks.append(ack_id)
            else:
                print('finished.json missing?', build_dir, started, finished)
    finally:
        pool.close()
    db.commit()
    return acks, build_dirs
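# Note on the pool above: each worker maps an (ack_id, job, build) entry to
# (ack_id, (build_dir, started, finished)), where started/finished are the parsed
# started.json/finished.json contents (or None when missing), so only builds with
# a finished.json are inserted and acked.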


def row_to_mapping(row, schema):
    """Convert a dictionary to a list for bigquery.Table.insert_data.

    Silly. See https://github.com/GoogleCloudPlatform/google-cloud-python/issues/3396
    """
    return [row.get(field.name, [] if field.mode == 'REPEATED' else None) for field in schema]
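# Illustrative example (field names assumed): given a schema of
# [SchemaField('path', 'STRING'), SchemaField('tests', 'RECORD', mode='REPEATED')],
# row_to_mapping({'path': 'gs://b/j/1'}, schema) returns ['gs://b/j/1', []].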


def retry(func, *args, **kwargs):
    """Run a function with arguments, retrying on server errors."""
    # pylint: disable=no-member
    for attempt in xrange(20):
        try:
            return func(*args, **kwargs)
        except (socket.error, google.cloud.exceptions.ServerError):
            # retry with exponential backoff
            traceback.print_exc()
            time.sleep(1.4 ** attempt)
    return func(*args, **kwargs)  # one last attempt
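# The backoff above sleeps 1.4**attempt seconds after each failure, roughly 35
# minutes in total across the 20 attempts before the final unguarded call.
# Used below as e.g. retry(sub.acknowledge, acks).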


def insert_data(table, rows_iter):
    """Upload rows from rows_iter into bigquery table table.

    rows_iter should return a series of (row_id, row dictionary) tuples.
    The row dictionary must match the table's schema.

    Returns the row_ids that were inserted.
    """
    emitted = set()

    rows = []
    row_ids = []

    for row_id, row in rows_iter:
        emitted.add(row_id)
        if len(json.dumps(row)) > 1e6:
            print('ERROR: row too long', row['path'])
            continue
        row = row_to_mapping(row, table.schema)
        rows.append(row)
        row_ids.append(row_id)

    if not rows:  # nothing to do
        return []

    def insert(table, rows, row_ids):
        """Insert rows with row_ids into table, retrying as necessary."""
        errors = retry(table.insert_data, rows, row_ids, skip_invalid_rows=True)

        if not errors:
            print('Loaded {} builds into {}'.format(len(rows), table.name))
        else:
            print('Errors:')
            pprint.pprint(errors)
            pprint.pprint(table.schema)

    if len(json.dumps(rows)) > 10e6:
        print('WARNING: too big for one insert, doing stupid slow version')
        for row, row_id in zip(rows, row_ids):
            insert(table, [row], [row_id])
    else:
        insert(table, rows, row_ids)

    return emitted
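# Illustrative call, as used in main() below: insert_data(table,
# make_json.make_rows(db, builds)), where make_rows yields (row_id, row_dict)
# pairs; rows over ~1 MB of JSON are skipped and batches over ~10 MB fall back
# to one insert per row.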


def main(db, sub, tables, client_class=make_db.GCSClient, stop=None):
    # pylint: disable=too-many-locals
    gcs_client = client_class('', {})

    if stop is None:
        stop = lambda: False

    results = [0] * 1000  # don't sleep on first loop
    while not stop():
        print()
        if len(results) < 10 and client_class is make_db.GCSClient:
            time.sleep(5)  # slow down!

        print('====', time.strftime("%F %T %Z"), '=' * 40)

        results = retry(sub.pull, max_messages=1000)
        start = time.time()
        while time.time() < start + 7:  # drain any backlog for up to 7 seconds
            results_more = sub.pull(max_messages=1000, return_immediately=True)
            if not results_more:
                break
            results += results_more

        print('PULLED', len(results))

        acks, todo = process_changes(results)

        if acks:
            print('ACK irrelevant', len(acks))
            for n in xrange(0, len(acks), 1000):  # acknowledge in batches of 1000
                retry(sub.acknowledge, acks[n: n + 1000])

        if todo:
            print('EXTEND-ACK ', len(todo))
            # give 3 minutes to grab build details
            retry(sub.modify_ack_deadline, [i for i, _j, _b in todo], 60*3)

        acks, build_dirs = get_started_finished(gcs_client, db, todo)

        # notify pubsub queue that we've handled the finished.json messages
        if acks:
            print('ACK "finished.json"', len(acks))
            retry(sub.acknowledge, acks)

        # grab junit files for new builds
        make_db.download_junit(db, 16, client_class)

        # stream new rows to tables
        if build_dirs and tables:
            for table, incremental_table in tables.itervalues():
                builds = db.get_builds_from_paths(build_dirs, incremental_table)
                emitted = insert_data(table, make_json.make_rows(db, builds))
                db.insert_emitted(emitted, incremental_table)


def load_sub(poll):
    """Return the PubSub subscription specified by the /-separated input."""
    project, topic, subscription = poll.split('/')
    pubsub_client = pubsub.Client(project)
    return pubsub_client.topic(topic).subscription(subscription)
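# Illustrative example (names are placeholders): load_sub('my-project/gcs-changes/kettle')
# returns the 'kettle' subscription on topic 'gcs-changes' in project 'my-project'.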


def load_schema(schemafield):
    """Construct the expected BigQuery schema from schema.json on disk.

    Only used for new tables."""
    basedir = os.path.dirname(__file__)
    schema_json = json.load(open(os.path.join(basedir, 'schema.json')))
    def make_field(spec):
        spec['field_type'] = spec.pop('type')
        if 'fields' in spec:
            spec['fields'] = [make_field(f) for f in spec['fields']]
        return schemafield(**spec)
    return [make_field(s) for s in schema_json]
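# Example of the conversion above (spec contents assumed): a schema.json entry
# {'name': 'path', 'type': 'STRING', 'mode': 'NULLABLE'} becomes
# SchemaField(name='path', field_type='STRING', mode='NULLABLE'); nested 'fields'
# are converted recursively.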


def load_tables(dataset, tablespecs):
    """Construct a dictionary of BigQuery tables given the input tablespec.

    Args:
        dataset: string of "PROJECT:DATASET_NAME", e.g. "k8s-gubernator:build"
        tablespecs: list of strings of "NAME:DAYS", e.g. ["day:1"]
    Returns:
        {name: (bigquery.Table, incremental table name)}
    """
    project, dataset_name = dataset.split(':')
    dataset = bigquery.Client(project).dataset(dataset_name)

    tables = {}
    for spec in tablespecs:
        name, days = spec.split(':')
        table = dataset.table(name)
        try:
            table.reload()
        except google.cloud.exceptions.NotFound:  # pylint: disable=no-member
            table.schema = load_schema(bigquery.schema.SchemaField)
            table.create()
        tables[name] = (table, make_json.get_table(float(days)))
    return tables
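# Illustrative call: load_tables('k8s-gubernator:build', ['day:1', 'week:7', 'all:0'])
# returns {'day': (<Table 'day'>, make_json.get_table(1.0)), ...}, creating any
# missing table with the schema loaded from schema.json.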


class StopWhen(object):
    """A simple object that returns True once when the given hour begins.

    It also returns True whenever a file named 'stop' exists in the working directory.
    """
    def __init__(self, target, clock=lambda: time.localtime().tm_hour):
        self.clock = clock
        self.last = self.clock()
        self.target = target

    def __call__(self):
        if os.path.exists('stop'):
            return True
        now = self.clock()
        last = self.last
        self.last = now
        return now != last and now == self.target
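# Usage example: stop=StopWhen(OPTIONS.stop_at) below makes main() return from its
# polling loop the first time the local hour ticks over to --stop_at.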


def get_options(argv):
    """Process command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--poll',
        required=True,
        help='Follow GCS changes from project/topic/subscription',
    )
    parser.add_argument(
        '--dataset',
        help='BigQuery dataset (e.g. k8s-gubernator:build)'
    )
    parser.add_argument(
        '--tables',
        nargs='+',
        default=[],
        help='Upload rows to table:days [e.g. --tables day:1 week:7 all:0]',
    )
    parser.add_argument(
        '--stop_at',
        type=int,
        help='Terminate when this hour (0-23) rolls around (in local time).'
    )
    return parser.parse_args(argv)


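# Illustrative invocation (argument values are placeholders):
#   python stream.py --poll my-project/gcs-changes/kettle \
#       --dataset k8s-gubernator:build --tables day:1 week:7 all:0 --stop_at 23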
if __name__ == '__main__':
    OPTIONS = get_options(sys.argv[1:])

    main(model.Database(),
         load_sub(OPTIONS.poll),
         load_tables(OPTIONS.dataset, OPTIONS.tables),
         stop=StopWhen(OPTIONS.stop_at))