github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/game/leader_board.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/game/leader_board.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Third in a series of four pipelines that tell a story in a 'gaming' domain.
    19  
    20  Concepts include: processing unbounded data using fixed windows; use of custom
    21  timestamps and event-time processing; generation of early/speculative results;
    22  using AccumulationMode.ACCUMULATING to do cumulative processing of late-arriving
    23  data.
    24  
    25  This pipeline processes an unbounded stream of 'game events'. The calculation of
    26  the team scores uses fixed windowing based on event time (the time of the game
    27  play event), not processing time (the time that an event is processed by the
    28  pipeline). The pipeline calculates the sum of scores per team, for each window.
    29  By default, the team scores are calculated using one-hour windows.
    30  
In contrast-- to demo another windowing option-- the user scores are calculated
using a global window, which periodically (after every ten elements, via an
AfterCount trigger) emits cumulative user score sums.
    34  
    35  In contrast to the previous pipelines in the series, which used static, finite
    36  input data, here we're using an unbounded data source, which lets us provide
    37  speculative results, and allows handling of late data, at much lower latency.
    38  We can use the early/speculative results to keep a 'leaderboard' updated in
    39  near-realtime. Our handling of late data lets us generate correct results,
    40  e.g. for 'team prizes'. We're now outputting window results as they're
    41  calculated, giving us much lower latency than with the previous batch examples.
    42  
    43  Run injector.Injector to generate pubsub data for this pipeline. The Injector
documentation provides more detail on how to do this. The injector is currently
implemented in Java only; it can be run from the Java SDK.
    46  
    47  The PubSub topic you specify should be the same topic to which the Injector is
    48  publishing.
    49  
    50  To run the Java injector:
    51  <beam_root>/examples/java$ mvn compile exec:java \
    52      -Dexec.mainClass=org.apache.beam.examples.complete.game.injector.Injector \
    53      -Dexec.args="$PROJECT_ID $PUBSUB_TOPIC none"
    54  
    55  For a description of the usage and options, use -h or --help.
    56  
    57  To specify a different runner:
    58    --runner YOUR_RUNNER
    59  
    60  NOTE: When specifying a different runner, additional runner-specific options
    61        may have to be passed in as well
    62  
    63  EXAMPLES
    64  --------
    65  
    66  # DirectRunner
    67  python leader_board.py \
    68      --project $PROJECT_ID \
    69      --topic projects/$PROJECT_ID/topics/$PUBSUB_TOPIC \
    70      --dataset $BIGQUERY_DATASET
    71  
    72  # DataflowRunner
    73  python leader_board.py \
    74      --project $PROJECT_ID \
    75      --region $REGION_ID \
    76      --topic projects/$PROJECT_ID/topics/$PUBSUB_TOPIC \
    77      --dataset $BIGQUERY_DATASET \
    78      --runner DataflowRunner \
    79      --temp_location gs://$BUCKET/user_score/temp
    80  """
    81  
    82  # pytype: skip-file
    83  
    84  import argparse
    85  import csv
    86  import logging
    87  import sys
    88  import time
    89  from datetime import datetime
    90  
    91  import apache_beam as beam
    92  from apache_beam.metrics.metric import Metrics
    93  from apache_beam.options.pipeline_options import GoogleCloudOptions
    94  from apache_beam.options.pipeline_options import PipelineOptions
    95  from apache_beam.options.pipeline_options import SetupOptions
    96  from apache_beam.options.pipeline_options import StandardOptions
    97  from apache_beam.transforms import trigger
    98  
    99  
   100  def timestamp2str(t, fmt='%Y-%m-%d %H:%M:%S.000'):
   101    """Converts a unix timestamp into a formatted string."""
   102    return datetime.fromtimestamp(t).strftime(fmt)
   103  
   104  
   105  class ParseGameEventFn(beam.DoFn):
   106    """Parses the raw game event info into a Python dictionary.
   107  
   108    Each event line has the following format:
   109      username,teamname,score,timestamp_in_ms,readable_time
   110  
   111    e.g.:
   112      user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224
   113  
   114    The human-readable time string is not used here.
   115    """
   116    def __init__(self):
   117      # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
   118      # super().__init__()
   119      beam.DoFn.__init__(self)
   120      self.num_parse_errors = Metrics.counter(self.__class__, 'num_parse_errors')
   121  
   122    def process(self, elem):
   123      try:
   124        row = list(csv.reader([elem]))[0]
   125        yield {
   126            'user': row[0],
   127            'team': row[1],
   128            'score': int(row[2]),
   129            'timestamp': int(row[3]) / 1000.0,
   130        }
   131      except:  # pylint: disable=bare-except
   132        # Log and count parse errors
   133        self.num_parse_errors.inc()
   134        logging.error('Parse error on "%s"', elem)
   135  
   136  
   137  class ExtractAndSumScore(beam.PTransform):
   138    """A transform to extract key/score information and sum the scores.
   139    The constructor argument `field` determines whether 'team' or 'user' info is
   140    extracted.
   141    """
   142    def __init__(self, field):
   143      # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
   144      # super().__init__()
   145      beam.PTransform.__init__(self)
   146      self.field = field
   147  
   148    def expand(self, pcoll):
   149      return (
   150          pcoll
   151          | beam.Map(lambda elem: (elem[self.field], elem['score']))
   152          | beam.CombinePerKey(sum))
   153  
   154  
   155  class TeamScoresDict(beam.DoFn):
   156    """Formats the data into a dictionary of BigQuery columns with their values
   157  
   158    Receives a (team, score) pair, extracts the window start timestamp, and
   159    formats everything together into a dictionary. The dictionary is in the format
   160    {'bigquery_column': value}
   161    """
   162    def process(self, team_score, window=beam.DoFn.WindowParam):
   163      team, score = team_score
   164      start = timestamp2str(int(window.start))
   165      yield {
   166          'team': team,
   167          'total_score': score,
   168          'window_start': start,
   169          'processing_time': timestamp2str(int(time.time()))
   170      }
   171  
   172  
   173  class WriteToBigQuery(beam.PTransform):
   174    """Generate, format, and write BigQuery table row information."""
   175    def __init__(self, table_name, dataset, schema, project):
   176      """Initializes the transform.
   177      Args:
   178        table_name: Name of the BigQuery table to use.
   179        dataset: Name of the dataset to use.
   180        schema: Dictionary in the format {'column_name': 'bigquery_type'}
   181        project: Name of the Cloud project containing BigQuery table.
   182      """
   183      # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
   184      # super().__init__()
   185      beam.PTransform.__init__(self)
   186      self.table_name = table_name
   187      self.dataset = dataset
   188      self.schema = schema
   189      self.project = project
   190  
   191    def get_schema(self):
   192      """Build the output table schema."""
   193      return ', '.join('%s:%s' % (col, self.schema[col]) for col in self.schema)
   194  
   195    def expand(self, pcoll):
   196      return (
   197          pcoll
   198          | 'ConvertToRow' >>
   199          beam.Map(lambda elem: {col: elem[col]
   200                                 for col in self.schema})
   201          | beam.io.WriteToBigQuery(
   202              self.table_name, self.dataset, self.project, self.get_schema()))
   203  
   204  
   205  # [START window_and_trigger]
   206  class CalculateTeamScores(beam.PTransform):
   207    """Calculates scores for each team within the configured window duration.
   208  
   209    Extract team/score pairs from the event stream, using hour-long windows by
   210    default.
   211    """
   212    def __init__(self, team_window_duration, allowed_lateness):
   213      # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
   214      # super().__init__()
   215      beam.PTransform.__init__(self)
   216      self.team_window_duration = team_window_duration * 60
   217      self.allowed_lateness_seconds = allowed_lateness * 60
   218  
   219    def expand(self, pcoll):
   220      # NOTE: the behavior does not exactly match the Java example
   221      # TODO: allowed_lateness not implemented yet in FixedWindows
   222      # TODO: AfterProcessingTime not implemented yet, replace AfterCount
   223      return (
   224          pcoll
   225          # We will get early (speculative) results as well as cumulative
   226          # processing of late data.
   227          | 'LeaderboardTeamFixedWindows' >> beam.WindowInto(
   228              beam.window.FixedWindows(self.team_window_duration),
   229              trigger=trigger.AfterWatermark(
   230                  trigger.AfterCount(10), trigger.AfterCount(20)),
   231              accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
   232          # Extract and sum teamname/score pairs from the event data.
   233          | 'ExtractAndSumScore' >> ExtractAndSumScore('team'))
   234  
   235  
   236  # [END window_and_trigger]
   237  
   238  
   239  # [START processing_time_trigger]
   240  class CalculateUserScores(beam.PTransform):
   241    """Extract user/score pairs from the event stream using processing time, via
   242    global windowing. Get periodic updates on all users' running scores.
   243    """
   244    def __init__(self, allowed_lateness):
   245      # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
   246      # super().__init__()
   247      beam.PTransform.__init__(self)
   248      self.allowed_lateness_seconds = allowed_lateness * 60
   249  
   250    def expand(self, pcoll):
   251      # NOTE: the behavior does not exactly match the Java example
   252      # TODO: allowed_lateness not implemented yet in FixedWindows
   253      # TODO: AfterProcessingTime not implemented yet, replace AfterCount
   254      return (
   255          pcoll
   256          # Get periodic results every ten events.
   257          | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
   258              beam.window.GlobalWindows(),
   259              trigger=trigger.Repeatedly(trigger.AfterCount(10)),
   260              accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
   261          # Extract and sum username/score pairs from the event data.
   262          | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
   263  
   264  
   265  # [END processing_time_trigger]
   266  
   267  
   268  def run(argv=None, save_main_session=True):
   269    """Main entry point; defines and runs the hourly_team_score pipeline."""
   270    parser = argparse.ArgumentParser()
   271  
   272    parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
   273    parser.add_argument(
   274        '--subscription', type=str, help='Pub/Sub subscription to read from')
   275    parser.add_argument(
   276        '--dataset',
   277        type=str,
   278        required=True,
   279        help='BigQuery Dataset to write tables to. '
   280        'Must already exist.')
   281    parser.add_argument(
   282        '--table_name',
   283        default='leader_board',
   284        help='The BigQuery table name. Should not already exist.')
   285    parser.add_argument(
   286        '--team_window_duration',
   287        type=int,
   288        default=60,
   289        help='Numeric value of fixed window duration for team '
   290        'analysis, in minutes')
   291    parser.add_argument(
   292        '--allowed_lateness',
   293        type=int,
   294        default=120,
   295        help='Numeric value of allowed data lateness, in minutes')
   296  
   297    args, pipeline_args = parser.parse_known_args(argv)
   298  
   299    if args.topic is None and args.subscription is None:
   300      parser.print_usage()
   301      print(sys.argv[0] + ': error: one of --topic or --subscription is required')
   302      sys.exit(1)
   303  
   304    options = PipelineOptions(pipeline_args)
   305  
   306    # We also require the --project option to access --dataset
   307    if options.view_as(GoogleCloudOptions).project is None:
   308      parser.print_usage()
   309      print(sys.argv[0] + ': error: argument --project is required')
   310      sys.exit(1)
   311  
   312    # We use the save_main_session option because one or more DoFn's in this
   313    # workflow rely on global context (e.g., a module imported at module level).
   314    options.view_as(SetupOptions).save_main_session = save_main_session
   315  
   316    # Enforce that this pipeline is always run in streaming mode
   317    options.view_as(StandardOptions).streaming = True
   318  
   319    with beam.Pipeline(options=options) as p:
   320      # Read game events from Pub/Sub using custom timestamps, which are extracted
   321      # from the pubsub data elements, and parse the data.
   322  
   323      # Read from PubSub into a PCollection.
   324      if args.subscription:
   325        scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
   326            subscription=args.subscription)
   327      else:
   328        scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(topic=args.topic)
   329  
   330      events = (
   331          scores
   332          | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
   333          | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
   334          | 'AddEventTimestamps' >> beam.Map(
   335              lambda elem: beam.window.TimestampedValue(elem, elem['timestamp'])))
   336  
   337      # Get team scores and write the results to BigQuery
   338      (  # pylint: disable=expression-not-assigned
   339          events
   340          | 'CalculateTeamScores' >> CalculateTeamScores(
   341              args.team_window_duration, args.allowed_lateness)
   342          | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
   343          | 'WriteTeamScoreSums' >> WriteToBigQuery(
   344              args.table_name + '_teams',
   345              args.dataset,
   346              {
   347                  'team': 'STRING',
   348                  'total_score': 'INTEGER',
   349                  'window_start': 'STRING',
   350                  'processing_time': 'STRING',
   351              },
   352              options.view_as(GoogleCloudOptions).project))
   353  
   354      def format_user_score_sums(user_score):
   355        (user, score) = user_score
   356        return {'user': user, 'total_score': score}
   357  
   358      # Get user scores and write the results to BigQuery
   359      (  # pylint: disable=expression-not-assigned
   360          events
   361          | 'CalculateUserScores' >> CalculateUserScores(args.allowed_lateness)
   362          | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
   363          | 'WriteUserScoreSums' >> WriteToBigQuery(
   364              args.table_name + '_users',
   365              args.dataset, {
   366                  'user': 'STRING',
   367                  'total_score': 'INTEGER',
   368              },
   369              options.view_as(GoogleCloudOptions).project))
   370  
   371  
if __name__ == '__main__':
  # When executed as a script: raise the root logger to INFO, then start the
  # pipeline with run()'s default arguments.
  logging.getLogger().setLevel(logging.INFO)
  run()