k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/kettle/stream.py

#!/usr/bin/env python3
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Receive push events for new builds and upload rows to BigQuery."""


import argparse
import json
import os
import pprint
import socket
import sys
import traceback
import time

import multiprocessing.pool
import ruamel.yaml as yaml

try:
    from google.api_core import exceptions as api_exceptions
    from google.cloud import bigquery
    from google.cloud import pubsub_v1
    import google.cloud.exceptions
except ImportError:
    print('WARNING: unable to load google cloud (test environment?)')
    traceback.print_exc()

import model
import make_db
import make_json


MAX_ROW_UPLOAD = 10  # See https://github.com/googleapis/google-cloud-go/issues/2855


def should_exclude(object_id, bucket_id, buckets):
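    """Return True if the object's job is listed in exclude_jobs for its bucket."""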
    # Objects are of the form a/b/c/<jobname>/<hash>/<objectFile>
    if bucket_id not in buckets:
        return False
    return any(f'/{job}/' in object_id for job in buckets[bucket_id].get('exclude_jobs', []))


def process_changes(results, buckets):
    """Split GCS change events into trivial ack_ids and builds to further process."""
    ack_ids = []  # pubsub rec_message ids to acknowledge
    todo = []  # (id, job, build) of builds to grab
    # process results, find finished builds to process
    for rec_message in results:
        object_id = rec_message.message.attributes['objectId']
        bucket_id = rec_message.message.attributes['bucketId']
        exclude = should_exclude(object_id, bucket_id, buckets)
        if not object_id.endswith('/finished.json') or exclude:
            ack_ids.append(rec_message.ack_id)
            continue
        job, build = object_id[:-len('/finished.json')].rsplit('/', 1)
        job = 'gs://%s/%s' % (bucket_id, job)
        todo.append((rec_message.ack_id, job, build))
    return ack_ids, todo


def get_started_finished(gcs_client, db, todo):
    """Download started/finished.json from build dirs in todo."""
    ack_ids = []
    build_dirs = []
    pool = multiprocessing.pool.ThreadPool(16)
    try:
        for ack_id, (build_dir, started, finished) in pool.imap_unordered(
                lambda ack_id_job_build: (ack_id_job_build[0], gcs_client.get_started_finished(
                    ack_id_job_build[1], ack_id_job_build[2])),
                todo):
            if finished:
                if not db.insert_build(build_dir, started, finished):
                    print('build dir already present in db: ', build_dir)
                start = time.localtime(started.get('timestamp', 0) if started else 0)
                print((build_dir, bool(started), bool(finished),
                       time.strftime('%F %T %Z', start),
                       finished and finished.get('result')))
                build_dirs.append(build_dir)
                ack_ids.append(ack_id)
            else:
                print('finished.json missing?', build_dir, started, finished)
    finally:
        pool.close()
    db.commit()
    return ack_ids, build_dirs


def retry(func, *args, **kwargs):
    """Run a function with arguments, retrying on server errors."""
    # pylint: disable=no-member
    for attempt in range(20):
        try:
            return func(*args, **kwargs)
        except (socket.error, google.cloud.exceptions.ServerError):
            # retry with exponential backoff
            traceback.print_exc()
            time.sleep(1.4 ** attempt)
        except api_exceptions.BadRequest as err:
            args_size = sys.getsizeof(args)
            kwargs_str = ','.join('{}={}'.format(k, v) for k, v in kwargs.items())
            print(f"Error running {func.__name__} "
                  f"({args_size} bytes in args, {kwargs_str}): {str(err).encode('utf8')}")
            return None  # skip; a BadRequest will not succeed on retry
    return func(*args, **kwargs)  # one last attempt


def insert_data(bq_client, table, rows_iter):
    """Upload rows from rows_iter into the given BigQuery table.

    rows_iter should yield a series of (row_id, row dictionary) tuples.
    The row dictionary must match the table's schema.

    Args:
        bq_client: Client connection to BigQuery
        table: bigquery.Table object that points to a specific table
        rows_iter: iterable of (row_id, dict) pairs representing make_json.Build rows
    Returns the row_ids that were inserted.
    """
    def divide_chunks(l, bin_size=MAX_ROW_UPLOAD):
        # break rows into chunks so a single insert does not hit data limits
        for i in range(0, len(l), bin_size):
            yield l[i:i + bin_size]

    emitted, rows = [], []

    for row_id, build in rows_iter:
        emitted.append(row_id)
        rows.append(build)

    if not rows:  # nothing to do
        return []

    for chunk in divide_chunks(rows):
        # Insert rows with row_ids into table, retrying as necessary.
        errors = retry(bq_client.insert_rows, table, chunk, skip_invalid_rows=True)
        if not errors:
            print(f'Loaded {len(chunk)} builds into {table.full_table_id}')
        else:
            print(f'Errors on chunk: {chunk}')
            pprint.pprint(errors)
            pprint.pprint(table.schema)

    return emitted


def main(
        db,
        subscriber,
        subscription_path,
        bq_client,
        tables,
        buckets,
        client_class=make_db.GCSClient,
        stop=None,
    ):
    # pylint: disable=too-many-locals
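    """Pull GCS change notifications and stream new build rows into BigQuery.

    Loops until stop() returns True: pulls finished.json notifications from the
    subscription, records started/finished metadata in the local database,
    downloads junit artifacts, and uploads new rows to each configured table.
    """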
    gcs_client = client_class('', {})
    if stop is None:
        stop = lambda: False

    results = [0] * 1000  # don't sleep on first loop
    while not stop():
        print()
        if len(results) < 10 and client_class is make_db.GCSClient:
            time.sleep(5)  # slow down!

        print('====', time.strftime("%F %T %Z"), '=' * 40)

        # pull a batch of GCS change notifications
        results = retry(subscriber.pull, subscription=subscription_path, max_messages=1000)
        results = list(results.received_messages)
        start = time.time()
        # keep pulling for up to 7 seconds to drain any backlog of notifications
        while time.time() < start + 7:
            results_more = list(subscriber.pull(
                subscription=subscription_path,
                max_messages=1000,
                return_immediately=True).received_messages)
            if not results_more:
                break
            results.extend(results_more)

        print('PULLED', len(results))

        ack_ids, todo = process_changes(results, buckets)

        if ack_ids:
            print('ACK irrelevant', len(ack_ids))
            for n in range(0, len(ack_ids), 1000):
                retry(
                    subscriber.acknowledge,
                    subscription=subscription_path,
                    ack_ids=ack_ids[n: n + 1000])

        if todo:
            print('EXTEND-ACK ', len(todo))
            # give 3 minutes to grab build details
            retry(
                subscriber.modify_ack_deadline,
                subscription=subscription_path,
                ack_ids=[i for i, _j, _b in todo],
                ack_deadline_seconds=60*3)

        ack_ids, build_dirs = get_started_finished(gcs_client, db, todo)

        # notify pubsub queue that we've handled the finished.json messages
        if ack_ids:
            print('ACK "finished.json"', len(ack_ids))
            retry(subscriber.acknowledge, subscription=subscription_path, ack_ids=ack_ids)

        # grab junit files for new builds
        make_db.download_junit(db, 16, client_class)

        # stream new rows to tables
        if build_dirs and tables:
            for table, incremental_table in tables.values():
                builds = db.get_builds_from_paths(build_dirs, incremental_table)
                emitted = insert_data(bq_client, table, make_json.make_rows(db, builds))
                db.insert_emitted(emitted, incremental_table)


def load_sub(poll):
    """Return the PubSub subscription specified by the /-separated input.

    Args:
        poll: Follow GCS changes from project/topic/subscription
              Ex: kubernetes-jenkins/gcs-changes/kettle

    Returns:
        (SubscriberClient, subscription path) tuple
    """
    subscriber = pubsub_v1.SubscriberClient()
    project_id, _, sub = poll.split('/')
    subscription_path = f'projects/{project_id}/subscriptions/{sub}'
    return subscriber, subscription_path


def load_schema(schemafield):
    """Construct the expected BigQuery schema from files on disk.

    Only used for new tables."""
    basedir = os.path.dirname(__file__)
    with open(os.path.join(basedir, 'schema.json')) as json_file:
        schema_json = json.load(json_file)
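    # schema.json is a list of BigQuery field specs, each roughly of the form
    # {"name": ..., "type": ..., "mode": ..., "fields": [...]} (see schema.json
    # for the real fields); make_field below renames "type" to the "field_type"
    # keyword that bigquery.SchemaField expects.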
    def make_field(spec):
        spec['field_type'] = spec.pop('type')
        if 'fields' in spec:
            spec['fields'] = [make_field(f) for f in spec['fields']]
        return schemafield(**spec)
    return [make_field(s) for s in schema_json]


def load_tables(dataset, tablespecs):
    """Construct a dictionary of BigQuery tables given the input tablespec.

    Args:
        dataset: '<project>:<dataset_name>' string (e.g. kubernetes-public:k8s_infra_kettle)
        tablespecs: list of strings of "NAME:DAYS", e.g. ["day:1"]
    Returns:
        client, {name: (bigquery.Table, incremental table name)}
    """
    project, dataset_name = dataset.split(':')
    bq_client = bigquery.Client(project)

    tables = {}
    for spec in tablespecs:
        table_name, days = spec.split(':')
        table_ref = f'{project}.{dataset_name}.{table_name}'
        try:
            table = bq_client.get_table(table_ref)  # pylint: disable=no-member
        except google.cloud.exceptions.NotFound:
            table = bq_client.create_table(table_ref)  # pylint: disable=no-member
            table.schema = load_schema(bigquery.schema.SchemaField)
        tables[table_name] = (table, make_json.get_table(float(days)))
    return bq_client, tables


class StopWhen:
    """A simple object that returns True once when the given hour begins.

    It also returns True whenever a file named 'stop' exists in the working directory."""
    def __init__(self, target, clock=lambda: time.localtime().tm_hour):
        self.clock = clock
        self.last = self.clock()
        self.target = target

    def __call__(self):
        if os.path.exists('stop'):
            return True
        now = self.clock()
        last = self.last
        self.last = now
        return now != last and now == self.target


def _make_bucket_map(path):
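    """Map GCS bucket names to their attributes from the buckets config file."""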
    with open(path) as buckets_file:
        bucket_map = yaml.safe_load(buckets_file)
    bucket_to_attrs = dict()
    for k, v in bucket_map.items():
        bucket = k.rsplit('/')[2]  # keys are of the form gs://<bucket>/...
        bucket_to_attrs[bucket] = v
    return bucket_to_attrs


def get_options(argv):
    """Process command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--poll',
        required=True,
        help='Follow GCS changes from project/topic/subscription',
    )
    parser.add_argument(
        '--dataset',
        help='BigQuery dataset (e.g. kubernetes-public:k8s_infra_kettle)'
    )
    parser.add_argument(
        '--tables',
        nargs='+',
        default=[],
        help='Upload rows to table:days [e.g. --tables day:1 week:7 all:0]',
    )
    parser.add_argument(
        '--stop_at',
        type=int,
        help='Terminate when this hour (0-23) rolls around (in local time).'
    )
    parser.add_argument(
        '--buckets',
        type=str,
        default='buckets.yaml',
        help='Path to bucket configuration.'
    )
    return parser.parse_args(argv)


if __name__ == '__main__':
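    # Typical invocation (values mirror the argument help above; illustrative only):
    #   ./stream.py --poll kubernetes-jenkins/gcs-changes/kettle \
    #       --dataset kubernetes-public:k8s_infra_kettle --tables day:1 week:7 all:0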
    OPTIONS = get_options(sys.argv[1:])
    main(model.Database(),
         *load_sub(OPTIONS.poll),
         *load_tables(OPTIONS.dataset, OPTIONS.tables),
         _make_bucket_map(OPTIONS.buckets),
         stop=StopWhen(OPTIONS.stop_at))