# github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/game/game_stats.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Fourth in a series of four pipelines that tell a story in a 'gaming' domain.

New concepts: session windows and finding session duration; use of both
singleton and non-singleton side inputs.

This pipeline builds on the LeaderBoard functionality, and adds some
"business intelligence" analysis: abuse detection and usage patterns. The
pipeline derives the Mean user score sum for a window, and uses that information
to identify likely spammers/robots. (The robots have a higher click rate than
the human users). The 'robot' users are then filtered out when calculating the
team scores.

Additionally, user sessions are tracked: that is, we find bursts of user
activity using session windows. Then, the mean session duration information is
recorded in the context of subsequent fixed windowing. (This could be used to
tell us what games are giving us greater user retention).

Run injector.Injector to generate pubsub data for this pipeline. The Injector
documentation provides more detail on how to do this. The injector is currently
implemented in Java only; it can be used from the Java SDK.

The PubSub topic you specify should be the same topic to which the Injector is
publishing.

To run the Java injector:
<beam_root>/examples/java$ mvn compile exec:java \
    -Dexec.mainClass=org.apache.beam.examples.complete.game.injector.Injector \
    -Dexec.args="$PROJECT_ID $PUBSUB_TOPIC none"

For a description of the usage and options, use -h or --help.

To specify a different runner:
  --runner YOUR_RUNNER

NOTE: When specifying a different runner, additional runner-specific options
      may have to be passed in as well

EXAMPLES
--------

# DirectRunner
python game_stats.py \
    --project $PROJECT_ID \
    --topic projects/$PROJECT_ID/topics/$PUBSUB_TOPIC \
    --dataset $BIGQUERY_DATASET

# DataflowRunner
python game_stats.py \
    --project $PROJECT_ID \
    --region $REGION_ID \
    --topic projects/$PROJECT_ID/topics/$PUBSUB_TOPIC \
    --dataset $BIGQUERY_DATASET \
    --runner DataflowRunner \
    --temp_location gs://$BUCKET/user_score/temp
"""

# pytype: skip-file

import argparse
import csv
import logging
import sys
import time
from datetime import datetime

import apache_beam as beam
from apache_beam.metrics.metric import Metrics
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions


def timestamp2str(t, fmt='%Y-%m-%d %H:%M:%S.000'):
  """Converts a unix timestamp into a formatted string.

  Args:
    t: Unix timestamp, in seconds.
    fmt: strftime format string. The default ends in a literal '.000', so the
      sub-second part of `t` is never rendered.

  Returns:
    The timestamp formatted with `fmt`. No timezone is passed to
    datetime.fromtimestamp, so the result is in the local time of the process.
  """
  return datetime.fromtimestamp(t).strftime(fmt)


class ParseGameEventFn(beam.DoFn):
  """Parses the raw game event info into a Python dictionary.

  Each event line has the following format:
    username,teamname,score,timestamp_in_ms,readable_time

  e.g.:
    user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224

  The human-readable time string is not used here.
  """
  def __init__(self):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
    self.num_parse_errors = Metrics.counter(self.__class__, 'num_parse_errors')

  def process(self, elem):
    """Parses one CSV event line.

    Args:
      elem: A single event line in the format described on the class.

    Yields:
      A dict with keys 'user', 'team', 'score' (int) and 'timestamp' (the
      millisecond field divided by 1000.0, i.e. seconds as a float). Lines that
      fail to parse yield nothing; they are counted in `num_parse_errors` and
      logged instead.
    """
    # Only the parsing is guarded. The yield sits outside the try block so
    # that GeneratorExit (raised at a suspended yield when the generator is
    # closed) is never mistaken for a parse error, and `except Exception`
    # lets KeyboardInterrupt / SystemExit propagate.
    try:
      row = list(csv.reader([elem]))[0]
      event = {
          'user': row[0],
          'team': row[1],
          'score': int(row[2]),
          'timestamp': int(row[3]) / 1000.0,
      }
    except Exception:  # pylint: disable=broad-except
      # Log and count parse errors
      self.num_parse_errors.inc()
      logging.error('Parse error on "%s"', elem)
      return
    yield event


class ExtractAndSumScore(beam.PTransform):
  """A transform to extract key/score information and sum the scores.
  The constructor argument `field` determines whether 'team' or 'user' info is
  extracted.
  """
  def __init__(self, field):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.field = field

  def expand(self, pcoll):
    """Maps each element to (elem[field], elem['score']) and sums per key."""
    return (
        pcoll
        | beam.Map(lambda elem: (elem[self.field], elem['score']))
        | beam.CombinePerKey(sum))


class TeamScoresDict(beam.DoFn):
  """Formats the data into a dictionary of BigQuery columns with their values

  Receives a (team, score) pair, extracts the window start timestamp, and
  formats everything together into a dictionary. The dictionary is in the format
  {'bigquery_column': value}
  """
  def process(self, team_score, window=beam.DoFn.WindowParam):
    """Yields one row dict for a (team, score) pair.

    Args:
      team_score: A (team, score) pair.
      window: The window the element belongs to; only its start is used.

    Yields:
      A dict with keys 'team', 'total_score', 'window_start' and
      'processing_time'. Both time values are passed through int() before
      being formatted by timestamp2str.
    """
    team, score = team_score
    start = timestamp2str(int(window.start))
    yield {
        'team': team,
        'total_score': score,
        'window_start': start,
        'processing_time': timestamp2str(int(time.time()))
    }


class WriteToBigQuery(beam.PTransform):
  """Generate, format, and write BigQuery table row information."""
  def __init__(self, table_name, dataset, schema, project):
    """Initializes the transform.
    Args:
      table_name: Name of the BigQuery table to use.
      dataset: Name of the dataset to use.
      schema: Dictionary in the format {'column_name': 'bigquery_type'}
      project: Name of the Cloud project containing BigQuery table.
    """
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.table_name = table_name
    self.dataset = dataset
    self.schema = schema
    self.project = project

  def get_schema(self):
    """Build the output table schema.

    Returns:
      A string of comma-separated 'column:type' entries, one per key of
      `self.schema`, in the dictionary's iteration order.
    """
    return ', '.join('%s:%s' % (col, self.schema[col]) for col in self.schema)

  def expand(self, pcoll):
    """Projects each element onto the schema columns and writes the rows."""
    return (
        pcoll
        # Keep only the columns named in the schema; an element lacking one
        # of them raises KeyError here.
        | 'ConvertToRow' >>
        beam.Map(lambda elem: {col: elem[col]
                               for col in self.schema})
        | beam.io.WriteToBigQuery(
            self.table_name, self.dataset, self.project, self.get_schema()))


# [START abuse_detect]
class CalculateSpammyUsers(beam.PTransform):
  """Filter out all but those users with a high clickrate, which we will
  consider as 'spammy' users.

  We do this by finding the mean total score per user, then using that
  information as a side input to filter out all but those user scores that are
  larger than (mean * SCORE_WEIGHT).
  """
  SCORE_WEIGHT = 2.5

  def expand(self, user_scores):
    """Returns the (user, total_score) pairs exceeding mean * SCORE_WEIGHT."""
    # Get the sum of scores for each user.
    sum_scores = (user_scores | 'SumUsersScores' >> beam.CombinePerKey(sum))

    # Extract the score from each element, and use it to find the global mean.
    global_mean_score = (
        sum_scores
        | beam.Values()
        | beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).as_singleton_view())

    # Filter the user sums using the global mean.
    filtered = (
        sum_scores
        # Use the derived mean total score (global_mean_score) as a side input.
        | 'ProcessAndFilter' >> beam.Filter(
            lambda key_score, global_mean: (
                key_score[1] > global_mean * self.SCORE_WEIGHT),
            global_mean_score))
    return filtered


# [END abuse_detect]


class UserSessionActivity(beam.DoFn):
  """Calculate and output an element's session duration, in seconds."""
  def process(self, elem, window=beam.DoFn.WindowParam):
    """Yields the length of the element's window in whole seconds.

    The element itself is ignored; the duration is the difference between the
    window's end and start in microseconds, floor-divided by 1,000,000.
    """
    yield (window.end.micros - window.start.micros) // 1000000


def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the game_stats pipeline.

  Args:
    argv: Command-line arguments to parse. Arguments not recognized by this
      function's parser are forwarded to PipelineOptions.
    save_main_session: Value assigned to SetupOptions.save_main_session.

  Exits the process with status 1 (after printing usage) when neither --topic
  nor --subscription is given, or when --project is missing.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
  parser.add_argument(
      '--subscription', type=str, help='Pub/Sub subscription to read from')
  parser.add_argument(
      '--dataset',
      type=str,
      required=True,
      help='BigQuery Dataset to write tables to. '
      'Must already exist.')
  parser.add_argument(
      '--table_name',
      type=str,
      default='game_stats',
      help='The BigQuery table name. Should not already exist.')
  parser.add_argument(
      '--fixed_window_duration',
      type=int,
      default=60,
      help='Numeric value of fixed window duration for user '
      'analysis, in minutes')
  parser.add_argument(
      '--session_gap',
      type=int,
      default=5,
      help='Numeric value of gap between user sessions, '
      'in minutes')
  parser.add_argument(
      '--user_activity_window_duration',
      type=int,
      default=30,
      help='Numeric value of fixed window for finding mean of '
      'user session duration, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  if args.topic is None and args.subscription is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: one of --topic or --subscription is required')
    sys.exit(1)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # The command-line durations are in minutes; the windowing below is
  # configured in seconds.
  fixed_window_duration = args.fixed_window_duration * 60
  session_gap = args.session_gap * 60
  user_activity_window_duration = args.user_activity_window_duration * 60

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  # Enforce that this pipeline is always run in streaming mode
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read game events from Pub/Sub using custom timestamps, which
    # are extracted from the data elements, and parse the data.
    if args.subscription:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=args.subscription)
    else:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(topic=args.topic)
    raw_events = (
        scores
        | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
        | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
        | 'AddEventTimestamps' >> beam.Map(
            lambda elem: beam.window.TimestampedValue(elem, elem['timestamp'])))

    # Extract username/score pairs from the event stream
    user_events = (
        raw_events
        | 'ExtractUserScores' >>
        beam.Map(lambda elem: (elem['user'], elem['score'])))

    # Calculate the total score per user over fixed windows, and cumulative
    # updates for late data
    spammers_view = (
        user_events
        | 'UserFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(fixed_window_duration))

        # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
        # These might be robots/spammers.
        | 'CalculateSpammyUsers' >> CalculateSpammyUsers()

        # Derive a view from the collection of spammer users. It will be used as
        # a side input in calculating the team score sums, below
        | 'CreateSpammersView' >> beam.CombineGlobally(
            beam.combiners.ToDictCombineFn()).as_singleton_view())

    # [START filter_and_calc]
    # Calculate the total score per team over fixed windows, and emit cumulative
    # updates for late data. Uses the side input derived above --the set of
    # suspected robots-- to filter out scores from those users from the sum.
    # Write the results to BigQuery.
    (  # pylint: disable=expression-not-assigned
        raw_events
        | 'WindowIntoFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(fixed_window_duration))

        # Filter out the detected spammer users, using the side input derived
        # above
        | 'FilterOutSpammers' >> beam.Filter(
            lambda elem, spammers: elem['user'] not in spammers, spammers_view)
        # Extract and sum teamname/score pairs from the event data.
        | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
        # [END filter_and_calc]
        | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
        | 'WriteTeamScoreSums' >> WriteToBigQuery(
            args.table_name + '_teams',
            args.dataset,
            {
                'team': 'STRING',
                'total_score': 'INTEGER',
                'window_start': 'STRING',
                'processing_time': 'STRING',
            },
            options.view_as(GoogleCloudOptions).project))

    # [START session_calc]
    # Detect user sessions-- that is, a burst of activity separated by a gap
    # from further activity. Find and record the mean session lengths.
    # This information could help the game designers track the changing user
    # engagement as their set of games changes.
    (  # pylint: disable=expression-not-assigned
        user_events
        | 'WindowIntoSessions' >> beam.WindowInto(
            beam.window.Sessions(session_gap),
            timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)

        # For this use, we care only about the existence of the session, not any
        # particular information aggregated over it, so we can just group by key
        # and assign a "dummy value" of None.
        | beam.CombinePerKey(lambda _: None)

        # Get the duration of the session
        | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
        # [END session_calc]

        # [START rewindow]
        # Re-window to process groups of session sums according to when the
        # sessions complete
        | 'WindowToExtractSessionMean' >> beam.WindowInto(
            beam.window.FixedWindows(user_activity_window_duration))

        # Find the mean session duration in each window
        | beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).without_defaults()
        | 'FormatAvgSessionLength' >>
        beam.Map(lambda elem: {'mean_duration': float(elem)})
        | 'WriteAvgSessionLength' >> WriteToBigQuery(
            args.table_name + '_sessions',
            args.dataset, {
                'mean_duration': 'FLOAT',
            },
            options.view_as(GoogleCloudOptions).project))
    # [END rewindow]


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()