github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """Nexmark launcher.
    19  
    20  The Nexmark suite is a series of queries (streaming pipelines) performed
    21  on a simulation of auction events. The launcher orchestrates the generation
    22  and parsing of streaming events and the running of queries.
    23  
    24  Model
    25    - Person: Author of an auction or a bid.
    26    - Auction: Item under auction.
    27    - Bid: A bid for an item under auction.
    28  
    29  Events
    30   - Create Person
    31   - Create Auction
    32   - Create Bid
    33  
    34  Queries
  - Query0: Pass through (send and receive auction events).
  - Query1 through Query12: see the query modules imported from
    apache_beam.testing.benchmarks.nexmark.queries below.

Usage
  - DirectRunner
      python nexmark_launcher.py \
          --query/q <query number> \
          --project <project id> \
          --loglevel=DEBUG (optional) \
          --wait_until_finish_duration <time_in_ms> \
          --streaming

  - DataflowRunner
      python nexmark_launcher.py \
          --query/q <query number> \
          --project <project id> \
          --region <GCE region> \
          --loglevel=DEBUG (optional) \
          --wait_until_finish_duration <time_in_ms> \
          --streaming \
          --sdk_location <apache_beam tar.gz> \
          --staging_location=gs://... \
          --temp_location=gs://

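  - Batch mode (DirectRunner; a sketch with placeholder values, since batch
    runs require --input instead of a Pub/Sub subscription)
      python nexmark_launcher.py \
          --query/q <query number> \
          --input <path to events file> \
          --num_events <expected number of events>
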
    58  """
    59  
    60  # pytype: skip-file
    61  
import argparse
import json
import logging
import os
import time
import uuid

import requests
from requests.auth import HTTPBasicAuth

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import TypeOptions
from apache_beam.runners import PipelineState
from apache_beam.testing.benchmarks.nexmark import nexmark_util
from apache_beam.testing.benchmarks.nexmark.monitor import Monitor
from apache_beam.testing.benchmarks.nexmark.monitor import MonitorSuffix
from apache_beam.testing.benchmarks.nexmark.nexmark_perf import NexmarkPerf
from apache_beam.testing.benchmarks.nexmark.queries import query0
from apache_beam.testing.benchmarks.nexmark.queries import query1
from apache_beam.testing.benchmarks.nexmark.queries import query2
from apache_beam.testing.benchmarks.nexmark.queries import query3
from apache_beam.testing.benchmarks.nexmark.queries import query4
from apache_beam.testing.benchmarks.nexmark.queries import query5
from apache_beam.testing.benchmarks.nexmark.queries import query6
from apache_beam.testing.benchmarks.nexmark.queries import query7
from apache_beam.testing.benchmarks.nexmark.queries import query8
from apache_beam.testing.benchmarks.nexmark.queries import query9
from apache_beam.testing.benchmarks.nexmark.queries import query10
from apache_beam.testing.benchmarks.nexmark.queries import query11
from apache_beam.testing.benchmarks.nexmark.queries import query12
from apache_beam.transforms import window


class NexmarkLauncher(object):

  # how long to wait, after results have been seen and no further activity is
  # observed, before cancelling the job
  DONE_DELAY = 5 * 60
  # delay in seconds between perf data samples
  PERF_DELAY = 20
  # delay before cancelling the job when the pipeline appears to be stuck
  TERMINATE_DELAY = 1 * 60 * 60
  # delay before warning when the pipeline appears to be stuck
  WARNING_DELAY = 10 * 60

  def __init__(self):
    self.parse_args()
    self.manage_resources = self.args.manage_resources
    self.uuid = str(uuid.uuid4()) if self.manage_resources else ''
    self.topic_name = (
        self.args.topic_name + self.uuid if self.args.topic_name else None)
    self.subscription_name = (
        self.args.subscription_name +
        self.uuid if self.args.subscription_name else None)
    self.pubsub_mode = self.args.pubsub_mode
    if self.manage_resources:
      from google.cloud import pubsub
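      # Note: pubsub.Client with topic()/subscription() is the legacy
      # google-cloud-pubsub surface; current releases expose separate
      # PublisherClient and SubscriberClient classes instead.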
      self.cleanup()
      publish_client = pubsub.Client(project=self.project)
      topic = publish_client.topic(self.topic_name)
      logging.info('creating topic %s', self.topic_name)
      topic.create()
      sub = topic.subscription(self.subscription_name)
      logging.info('creating sub %s', self.subscription_name)
      sub.create()

    self.export_influxdb = self.args.export_summary_to_influx_db
    if self.export_influxdb:
      self.influx_database = self.args.influx_database
      self.influx_host = self.args.influx_host
      self.influx_base = self.args.base_influx_measurement
      self.influx_retention = self.args.influx_retention_policy

  def parse_args(self):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--query',
        '-q',
        type=int,
        action='append',
        required=True,
        choices=list(range(13)),
        help='Query to run')

    parser.add_argument(
        '--subscription_name',
        type=str,
        help='Pub/Sub subscription to read from')

    parser.add_argument(
        '--topic_name',
        type=str,
        help='Pub/Sub topic to publish to or read from')

    parser.add_argument(
        '--loglevel',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        default='INFO',
        help='Set the logging level')
    parser.add_argument(
        '--input',
        type=str,
        help='Path to the data file containing Nexmark events.')
    parser.add_argument(
        '--num_events',
        type=int,
        default=100000,
        help='Number of events expected to be processed.')
    parser.add_argument(
        '--manage_resources',
        default=False,
        action='store_true',
        help='If set, manage the creation and cleanup of topics and '
        'subscriptions.')
    parser.add_argument(
        '--pubsub_mode',
        type=str,
        default='SUBSCRIBE_ONLY',
        choices=['PUBLISH_ONLY', 'SUBSCRIBE_ONLY', 'COMBINED'],
        help='Pub/Sub mode used in the pipeline.')

    parser.add_argument(
        '--export_summary_to_influx_db',
        default=False,
        action='store_true',
        help='If set, store results in InfluxDB.')
    parser.add_argument(
        '--influx_database',
        type=str,
        default='beam_test_metrics',
        help='InfluxDB database name.')
    parser.add_argument(
        '--influx_host',
        type=str,
        default='http://localhost:8086',
        help='InfluxDB server URL.')
    parser.add_argument(
        '--base_influx_measurement',
        type=str,
        default='nexmark',
        help='Prefix for the InfluxDB measurement name.')
    parser.add_argument(
        '--influx_retention_policy',
        type=str,
        default='forever',
        help='Retention policy for stored results.')

    self.args, self.pipeline_args = parser.parse_known_args()
    logging.basicConfig(
        level=getattr(logging, self.args.loglevel, None),
        format='(%(threadName)-10s) %(message)s')

    self.pipeline_options = PipelineOptions(self.pipeline_args)
    logging.debug('args, pipeline_args: %s, %s', self.args, self.pipeline_args)

    # Usage with Dataflow requires a project to be supplied.
    self.project = self.pipeline_options.view_as(GoogleCloudOptions).project
    self.streaming = self.pipeline_options.view_as(StandardOptions).streaming
    self.pipeline_options.view_as(TypeOptions).allow_unsafe_triggers = True

    if self.streaming:
      if self.args.subscription_name is None or self.project is None:
        raise ValueError(
            'arguments --subscription_name and --project '
            'are required when running in streaming mode')
    else:
      if self.args.input is None:
        raise ValueError(
            'argument --input is required when running in batch mode')

    # We use the save_main_session option because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    self.pipeline_options.view_as(SetupOptions).save_main_session = True

  def generate_events(self):
    from google.cloud import pubsub
    publish_client = pubsub.Client(project=self.project)
    topic = publish_client.topic(self.topic_name)

    logging.info('Generating auction events to topic %s', topic.name)

    if self.args.input.startswith('gs://'):
      from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
      fs = GCSFileSystem(self.pipeline_options)
      with fs.open(self.args.input) as infile:
        for line in infile:
          topic.publish(line)
    else:
      with open(self.args.input) as infile:
        for line in infile:
          topic.publish(line)

    logging.info('Finished event generation.')

  def read_from_file(self):
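    # Each input line is expected to hold one JSON-serialized Nexmark event;
    # ParseJsonEventFn turns it into a Person, Auction, or Bid model object,
    # and each element is then timestamped with its event time (date_time).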
    return (
        self.pipeline
        | 'reading_from_file' >> beam.io.ReadFromText(self.args.input)
        | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn())
        | 'timestamping' >>
        beam.Map(lambda e: window.TimestampedValue(e, e.date_time)))

  def read_from_pubsub(self):
    # Read from PubSub into a PCollection.
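    # Messages are read with attributes so that each message's 'timestamp'
    # attribute can supply the element's event time; 'pubsub_unwrap' below
    # keeps only the payload bytes.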
    if self.subscription_name:
      raw_events = self.pipeline | 'ReadPubSub_sub' >> beam.io.ReadFromPubSub(
          subscription=self.subscription_name,
          with_attributes=True,
          timestamp_attribute='timestamp')
    else:
      raw_events = self.pipeline | 'ReadPubSub_topic' >> beam.io.ReadFromPubSub(
          topic=self.topic_name,
          with_attributes=True,
          timestamp_attribute='timestamp')
    events = (
        raw_events
        | 'pubsub_unwrap' >> beam.Map(lambda m: m.data)
        | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn()))
    return events

  def run_query(
      self, query_num, query, query_args, pipeline_options, query_errors):
    try:
      self.pipeline = beam.Pipeline(options=self.pipeline_options)
      nexmark_util.setup_coder()

      event_monitor = Monitor('.events', 'event')
      result_monitor = Monitor('.results', 'result')

      if self.streaming:
        if self.pubsub_mode != 'SUBSCRIBE_ONLY':
          self.generate_events()
        if self.pubsub_mode == 'PUBLISH_ONLY':
          return
        events = self.read_from_pubsub()
      else:
        events = self.read_from_file()

      events = events | 'event_monitor' >> beam.ParDo(event_monitor.doFn)
      output = query.load(events, query_args, pipeline_options)
      output | 'result_monitor' >> beam.ParDo(result_monitor.doFn)  # pylint: disable=expression-not-assigned

      result = self.pipeline.run()
      if not self.streaming:
        result.wait_until_finish()
      perf = self.monitor(result, event_monitor, result_monitor)
      self.log_performance(perf)
      if self.export_influxdb:
        self.publish_performance_influxdb(query_num, perf)

    except Exception as exc:
      query_errors.append(str(exc))
      raise

  def monitor(self, job, event_monitor, result_monitor):
    """
    Keep monitoring the performance and progress of the running job, and
    cancel the job if it appears to be stuck or seems to have finished.

    Returns:
      the final performance if it is measured
    """
    logging.info('starting to monitor the job')
    last_active_ms = -1
    perf = None
    cancel_job = False
    waiting_for_shutdown = False

    while True:
      now = int(time.time() * 1000)  # current time in ms
      logging.debug('now is %d', now)

      curr_perf = NexmarkLauncher.get_performance(
          job, event_monitor, result_monitor)
      if perf is None or curr_perf.has_progress(perf):
        last_active_ms = now

      # only judge whether the job should be cancelled if it is a streaming
      # job and has not already been asked to shut down
      if self.streaming and not waiting_for_shutdown:
        quiet_duration = (now - last_active_ms) // 1000
        if (curr_perf.event_count >= self.args.num_events and
            curr_perf.result_count >= 0 and quiet_duration > self.DONE_DELAY):
          # we consider the job finished if the expected input count has been
          # seen and no new results have been produced for a while
          logging.info('streaming query appears to have finished executing')
          waiting_for_shutdown = True
          cancel_job = True
        elif quiet_duration > self.TERMINATE_DELAY:
          logging.error(
              'streaming query has been stuck for %d seconds', quiet_duration)
          logging.error('canceling streaming job')
          waiting_for_shutdown = True
          cancel_job = True
        elif quiet_duration > self.WARNING_DELAY:
          logging.warning(
              'streaming query has been stuck for %d seconds', quiet_duration)

        if cancel_job:
          job.cancel()

      perf = curr_perf

      stopped = PipelineState.is_terminal(job.state)
      if stopped:
        break

      if not waiting_for_shutdown:
        if last_active_ms == now:
          logging.info('activity seen, new performance data extracted')
        else:
          logging.info('no activity seen')
      else:
        logging.info('waiting for shutdown')

      time.sleep(self.PERF_DELAY)

    return perf

  @staticmethod
  def log_performance(perf):
    # type: (NexmarkPerf) -> None
    logging.info(
        'input event count: %d, output event count: %d',
        perf.event_count,
        perf.result_count)
    logging.info(
        'query run took %.1f seconds and processed %.1f events per second',
        perf.runtime_sec,
        perf.event_per_sec)

  def publish_performance_influxdb(self, query_num, perf):
    processing_mode = "streaming" if self.streaming else "batch"
    measurement = "%s_%d_python_%s" % (
        self.influx_base, query_num, processing_mode)

    tags = {'runner': self.pipeline_options.view_as(StandardOptions).runner}

    mt = ','.join([measurement] + [k + "=" + v for k, v in tags.items()])

    fields = {
        'numResults': "%di" % (perf.result_count),
        'runtimeMs': "%di" % (perf.runtime_sec * 1000),
    }

    ts = int(time.time())
    payload = '\n'.join(
        ["%s %s=%s %d" % (mt, k, v, ts) for k, v in fields.items()])

    url = '%s/write' % (self.influx_host)
    query_str = {
        'db': self.influx_database,
        'rp': self.influx_retention,
        'precision': 's',
    }

    user = os.getenv('INFLUXDB_USER')
    password = os.getenv('INFLUXDB_USER_PASSWORD')
    auth = HTTPBasicAuth(user, password)

    try:
      response = requests.post(url, params=query_str, data=payload, auth=auth)
    except requests.exceptions.RequestException as e:
      logging.warning('Failed to publish metrics to InfluxDB: %s', e)
    else:
      if response.status_code != 204:
        content = json.loads(response.content)
        logging.warning(
            'Failed to publish metrics to InfluxDB. Received status code %s '
            'with an error message: %s',
            response.status_code,
            content['error'])

  @staticmethod
  def get_performance(result, event_monitor, result_monitor):
    event_count = nexmark_util.get_counter_metric(
        result,
        event_monitor.namespace,
        event_monitor.name_prefix + MonitorSuffix.ELEMENT_COUNTER)
    event_start = nexmark_util.get_start_time_metric(
        result,
        event_monitor.namespace,
        event_monitor.name_prefix + MonitorSuffix.EVENT_TIME)
    event_end = nexmark_util.get_end_time_metric(
        result,
        event_monitor.namespace,
        event_monitor.name_prefix + MonitorSuffix.EVENT_TIME)
    result_count = nexmark_util.get_counter_metric(
        result,
        result_monitor.namespace,
        result_monitor.name_prefix + MonitorSuffix.ELEMENT_COUNTER)
    result_end = nexmark_util.get_end_time_metric(
        result,
        result_monitor.namespace,
        result_monitor.name_prefix + MonitorSuffix.EVENT_TIME)

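    # Runtime spans from the first event seen to the latest of the last event
    # or last result; e.g. event_start = 1_000 ms and effective_end = 61_000 ms
    # give runtime_sec = 60.0, so 120_000 events yield 2_000 events/sec
    # (numbers illustrative only).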
    perf = NexmarkPerf()
    perf.event_count = event_count
    perf.result_count = result_count
    effective_end = max(event_end, result_end)
    if effective_end >= 0 and event_start >= 0:
      perf.runtime_sec = (effective_end - event_start) / 1000
    if event_count >= 0 and perf.runtime_sec > 0:
      perf.event_per_sec = event_count / perf.runtime_sec

    return perf

  def cleanup(self):
    if self.manage_resources:
      from google.cloud import pubsub
      publish_client = pubsub.Client(project=self.project)
      topic = publish_client.topic(self.topic_name)
      if topic.exists():
        logging.info('deleting topic %s', self.topic_name)
        topic.delete()
      sub = topic.subscription(self.subscription_name)
      if sub.exists():
        logging.info('deleting sub %s', self.subscription_name)
        sub.delete()

  def run(self):
    queries = {
        0: query0,
        1: query1,
        2: query2,
        3: query3,
        4: query4,
        5: query5,
        6: query6,
        7: query7,
        8: query8,
        9: query9,
        10: query10,
        11: query11,
        12: query12
    }

    # TODO(mariagh): Move to a config file.
    query_args = {
        'auction_skip': 123,
        'window_size_sec': 10,
        'window_period_sec': 5,
        'fanout': 5,
        'num_max_workers': 5,
        'max_log_events': 100000,
        'occasional_delay_sec': 3,
        'max_auction_waiting_time': 600
    }

    query_errors = []
    for i in self.args.query:
      logging.info('Running query %d', i)
      self.run_query(
          i,
          queries[i],
          query_args,
          self.pipeline_options,
          query_errors=query_errors)

    if query_errors:
      logging.error('Queries failed with: %s', ', '.join(query_errors))
    else:
      logging.info('Queries run: %s', self.args.query)


if __name__ == '__main__':
  launcher = NexmarkLauncher()
  launcher.run()
  launcher.cleanup()