github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/game/game_stats.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Fourth in a series of four pipelines that tell a story in a 'gaming' domain.

New concepts: session windows and finding session duration; use of both
singleton and non-singleton side inputs.

This pipeline builds on the LeaderBoard functionality, and adds some
"business intelligence" analysis: abuse detection and usage patterns. The
pipeline derives the mean user score sum for a window, and uses that
information to identify likely spammers/robots (the robots have a higher
click rate than the human users). The 'robot' users are then filtered out
when calculating the team scores.

Additionally, user sessions are tracked: that is, we find bursts of user
activity using session windows. Then, the mean session duration information is
recorded in the context of subsequent fixed windowing. (This could be used to
tell us which games are giving us greater user retention.)

Run injector.Injector to generate pubsub data for this pipeline. The Injector
documentation provides more detail on how to do this. The injector is
currently implemented in Java only; it can be run from the Java SDK.

The PubSub topic you specify should be the same topic to which the Injector is
publishing.

To run the Java injector:
<beam_root>/examples/java$ mvn compile exec:java \
    -Dexec.mainClass=org.apache.beam.examples.complete.game.injector.Injector \
    -Dexec.args="$PROJECT_ID $PUBSUB_TOPIC none"

For a description of the usage and options, use -h or --help.

To specify a different runner:
  --runner YOUR_RUNNER

NOTE: When specifying a different runner, additional runner-specific options
      may have to be passed in as well.

EXAMPLES
--------

# DirectRunner
python game_stats.py \
    --project $PROJECT_ID \
    --topic projects/$PROJECT_ID/topics/$PUBSUB_TOPIC \
    --dataset $BIGQUERY_DATASET

# DataflowRunner
python game_stats.py \
    --project $PROJECT_ID \
    --region $REGION_ID \
    --topic projects/$PROJECT_ID/topics/$PUBSUB_TOPIC \
    --dataset $BIGQUERY_DATASET \
    --runner DataflowRunner \
    --temp_location gs://$BUCKET/user_score/temp
"""

# pytype: skip-file

import argparse
import csv
import logging
import sys
import time
from datetime import datetime

import apache_beam as beam
from apache_beam.metrics.metric import Metrics
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions


def timestamp2str(t, fmt='%Y-%m-%d %H:%M:%S.000'):
  """Converts a unix timestamp into a formatted string.
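  For example, with the local timezone set to UTC,
  timestamp2str(1445230923.951) returns '2015-10-19 05:02:03.000'
  (datetime.fromtimestamp converts to local time, so the exact output
  depends on the timezone).
  """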
  return datetime.fromtimestamp(t).strftime(fmt)


class ParseGameEventFn(beam.DoFn):
  """Parses the raw game event info into a Python dictionary.

  Each event line has the following format:
    username,teamname,score,timestamp_in_ms,readable_time

  e.g.:
    user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224

  The human-readable time string is not used here.
  """
  def __init__(self):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
    self.num_parse_errors = Metrics.counter(self.__class__, 'num_parse_errors')

  def process(self, elem):
    try:
      row = list(csv.reader([elem]))[0]
      yield {
          'user': row[0],
          'team': row[1],
          'score': int(row[2]),
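          # The raw timestamp field is in milliseconds; convert to seconds.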
          'timestamp': int(row[3]) / 1000.0,
      }
    except:  # pylint: disable=bare-except
      # Log and count parse errors
      self.num_parse_errors.inc()
      logging.error('Parse error on "%s"', elem)


class ExtractAndSumScore(beam.PTransform):
  """A transform to extract key/score information and sum the scores.

  The constructor argument `field` determines whether 'team' or 'user' info is
  extracted.
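  For example, with field='team', an element such as
  {'user': 'user2_AsparagusPig', 'team': 'AsparagusPig', 'score': 10, ...}
  is mapped to ('AsparagusPig', 10), and the scores are then summed per team.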
  """
  def __init__(self, field):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.field = field

  def expand(self, pcoll):
    return (
        pcoll
        | beam.Map(lambda elem: (elem[self.field], elem['score']))
        | beam.CombinePerKey(sum))


class TeamScoresDict(beam.DoFn):
  """Formats the data into a dictionary of BigQuery columns with their values.

  Receives a (team, score) pair, extracts the window start timestamp, and
  formats everything together into a dictionary. The dictionary is in the
  format {'bigquery_column': value}.
  """
  def process(self, team_score, window=beam.DoFn.WindowParam):
    team, score = team_score
    start = timestamp2str(int(window.start))
    yield {
        'team': team,
        'total_score': score,
        'window_start': start,
        'processing_time': timestamp2str(int(time.time()))
    }


class WriteToBigQuery(beam.PTransform):
  """Generate, format, and write BigQuery table row information."""
  def __init__(self, table_name, dataset, schema, project):
    """Initializes the transform.

    Args:
      table_name: Name of the BigQuery table to use.
      dataset: Name of the dataset to use.
      schema: Dictionary in the format {'column_name': 'bigquery_type'}.
      project: Name of the Cloud project containing the BigQuery table.
    """
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.table_name = table_name
    self.dataset = dataset
    self.schema = schema
    self.project = project

  def get_schema(self):
    """Build the output table schema."""
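    # e.g. a schema dict {'team': 'STRING', 'total_score': 'INTEGER'} becomes
    # the string 'team:STRING, total_score:INTEGER'.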
    return ', '.join('%s:%s' % (col, self.schema[col]) for col in self.schema)

  def expand(self, pcoll):
    return (
        pcoll
        | 'ConvertToRow' >>
        beam.Map(lambda elem: {col: elem[col]
                               for col in self.schema})
        | beam.io.WriteToBigQuery(
            self.table_name, self.dataset, self.project, self.get_schema()))


# [START abuse_detect]
class CalculateSpammyUsers(beam.PTransform):
  """Filter out all but those users with a high click rate, which we will
  consider as 'spammy' users.

  We do this by finding the mean total score per user, then using that
  information as a side input to filter out all but those user scores that are
  larger than (mean * SCORE_WEIGHT).
  """
  SCORE_WEIGHT = 2.5

  def expand(self, user_scores):
    # Get the sum of scores for each user.
    sum_scores = (user_scores | 'SumUsersScores' >> beam.CombinePerKey(sum))

    # Extract the score from each element, and use it to find the global mean.
    global_mean_score = (
        sum_scores
        | beam.Values()
        | beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).as_singleton_view())

    # Filter the user sums using the global mean.
    filtered = (
        sum_scores
        # Use the derived mean total score (global_mean_score) as a side input.
        | 'ProcessAndFilter' >> beam.Filter(
            lambda key_score, global_mean:
                key_score[1] > global_mean * self.SCORE_WEIGHT,
            global_mean_score))
    return filtered


# [END abuse_detect]


class UserSessionActivity(beam.DoFn):
  """Calculate and output an element's session duration, in seconds."""
  def process(self, elem, window=beam.DoFn.WindowParam):
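    # window.start and window.end are Beam Timestamps measured in
    # microseconds; integer division by 1000000 yields the session length
    # in whole seconds.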
    yield (window.end.micros - window.start.micros) // 1000000


def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the game_stats pipeline."""
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
  parser.add_argument(
      '--subscription', type=str, help='Pub/Sub subscription to read from')
  parser.add_argument(
      '--dataset',
      type=str,
      required=True,
      help='BigQuery Dataset to write tables to. '
      'Must already exist.')
  parser.add_argument(
      '--table_name',
      type=str,
      default='game_stats',
      help='The BigQuery table name. Should not already exist.')
  parser.add_argument(
      '--fixed_window_duration',
      type=int,
      default=60,
      help='Numeric value of fixed window duration for user '
      'analysis, in minutes')
  parser.add_argument(
      '--session_gap',
      type=int,
      default=5,
      help='Numeric value of gap between user sessions, '
      'in minutes')
  parser.add_argument(
      '--user_activity_window_duration',
      type=int,
      default=30,
      help='Numeric value of fixed window for finding mean of '
      'user session duration, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  if args.topic is None and args.subscription is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: one of --topic or --subscription is required')
    sys.exit(1)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset.
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  fixed_window_duration = args.fixed_window_duration * 60
  session_gap = args.session_gap * 60
  user_activity_window_duration = args.user_activity_window_duration * 60

  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  # Enforce that this pipeline is always run in streaming mode.
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read game events from Pub/Sub using custom timestamps, which
    # are extracted from the data elements, and parse the data.
    if args.subscription:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=args.subscription)
    else:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(topic=args.topic)
    raw_events = (
        scores
        | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
        | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
        | 'AddEventTimestamps' >> beam.Map(
            lambda elem: beam.window.TimestampedValue(elem, elem['timestamp'])))

    # Extract username/score pairs from the event stream.
    user_events = (
        raw_events
        | 'ExtractUserScores' >>
        beam.Map(lambda elem: (elem['user'], elem['score'])))

    # Calculate the total score per user over fixed windows, and cumulative
    # updates for late data.
    spammers_view = (
        user_events
        | 'UserFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(fixed_window_duration))

        # Filter out everyone but those with a total score larger than
        # (SCORE_WEIGHT * avg). These might be robots/spammers.
        | 'CalculateSpammyUsers' >> CalculateSpammyUsers()

        # Derive a view from the collection of spammer users. It will be used
        # as a side input in calculating the team score sums, below.
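        # (beam.combiners.ToDictCombineFn turns the filtered (user, score)
        # pairs into a single {user: score} dict per window.)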
        | 'CreateSpammersView' >> beam.CombineGlobally(
            beam.combiners.ToDictCombineFn()).as_singleton_view())

    # [START filter_and_calc]
    # Calculate the total score per team over fixed windows, and emit
    # cumulative updates for late data. Uses the side input derived above
    # (the set of suspected robots) to filter out scores from those users
    # from the sum. Write the results to BigQuery.
    (  # pylint: disable=expression-not-assigned
        raw_events
        | 'WindowIntoFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(fixed_window_duration))

        # Filter out the detected spammer users, using the side input derived
        # above.
        | 'FilterOutSpammers' >> beam.Filter(
            lambda elem, spammers: elem['user'] not in spammers, spammers_view)
        # Extract and sum teamname/score pairs from the event data.
        | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
        # [END filter_and_calc]
        | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
        | 'WriteTeamScoreSums' >> WriteToBigQuery(
            args.table_name + '_teams',
            args.dataset,
            {
                'team': 'STRING',
                'total_score': 'INTEGER',
                'window_start': 'STRING',
                'processing_time': 'STRING',
            },
            options.view_as(GoogleCloudOptions).project))

    # [START session_calc]
    # Detect user sessions: that is, a burst of activity separated by a gap
    # from further activity. Find and record the mean session lengths.
    # This information could help the game designers track changing user
    # engagement as their set of games changes.
    (  # pylint: disable=expression-not-assigned
        user_events
        | 'WindowIntoSessions' >> beam.WindowInto(
            beam.window.Sessions(session_gap),
            timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)
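        # (OUTPUT_AT_EOW stamps each session's output with the end of the
        # session window, so the fixed-window re-windowing below groups
        # sessions by when they completed.)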

        # For this use, we care only about the existence of the session, not
        # any particular information aggregated over it, so we can just group
        # by key and assign a "dummy value" of None.
        | beam.CombinePerKey(lambda _: None)

        # Get the duration of the session.
        | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
        # [END session_calc]

        # [START rewindow]
        # Re-window to process groups of session sums according to when the
        # sessions complete.
        | 'WindowToExtractSessionMean' >> beam.WindowInto(
            beam.window.FixedWindows(user_activity_window_duration))

        # Find the mean session duration in each window.
        | beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).without_defaults()
        | 'FormatAvgSessionLength' >>
        beam.Map(lambda elem: {'mean_duration': float(elem)})
        | 'WriteAvgSessionLength' >> WriteToBigQuery(
            args.table_name + '_sessions',
            args.dataset, {
                'mean_duration': 'FLOAT',
            },
            options.view_as(GoogleCloudOptions).project))
    # [END rewindow]


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()