#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Third in a series of four pipelines that tell a story in a 'gaming' domain.

Concepts include: processing unbounded data using fixed windows; use of custom
timestamps and event-time processing; generation of early/speculative results;
using AccumulationMode.ACCUMULATING to do cumulative processing of late-arriving
data.

This pipeline processes an unbounded stream of 'game events'. The calculation of
the team scores uses fixed windowing based on event time (the time of the game
play event), not processing time (the time that an event is processed by the
pipeline). The pipeline calculates the sum of scores per team, for each window.
By default, the team scores are calculated using one-hour windows.

In contrast-- to demo another windowing option-- the user scores are calculated
using a global window, which periodically emits cumulative user score sums
(every ten events in this Python version; see CalculateUserScores).

In contrast to the previous pipelines in the series, which used static, finite
input data, here we're using an unbounded data source, which lets us provide
speculative results, and allows handling of late data, at much lower latency.
We can use the early/speculative results to keep a 'leaderboard' updated in
near-realtime. Our handling of late data lets us generate correct results,
e.g. for 'team prizes'. We're now outputting window results as they're
calculated, giving us much lower latency than with the previous batch examples.

Run injector.Injector to generate pubsub data for this pipeline. The Injector
documentation provides more detail on how to do this. The injector is currently
implemented in Java only, it can be used from the Java SDK.

The PubSub topic you specify should be the same topic to which the Injector is
publishing.

To run the Java injector:
<beam_root>/examples/java$ mvn compile exec:java \
    -Dexec.mainClass=org.apache.beam.examples.complete.game.injector.Injector \
    -Dexec.args="$PROJECT_ID $PUBSUB_TOPIC none"

For a description of the usage and options, use -h or --help.

To specify a different runner:
  --runner YOUR_RUNNER

NOTE: When specifying a different runner, additional runner-specific options
      may have to be passed in as well

EXAMPLES
--------

# DirectRunner
python leader_board.py \
    --project $PROJECT_ID \
    --topic projects/$PROJECT_ID/topics/$PUBSUB_TOPIC \
    --dataset $BIGQUERY_DATASET

# DataflowRunner
python leader_board.py \
    --project $PROJECT_ID \
    --region $REGION_ID \
    --topic projects/$PROJECT_ID/topics/$PUBSUB_TOPIC \
    --dataset $BIGQUERY_DATASET \
    --runner DataflowRunner \
    --temp_location gs://$BUCKET/user_score/temp
"""

# pytype: skip-file

import argparse
import csv
import logging
import sys
import time
from datetime import datetime

import apache_beam as beam
from apache_beam.metrics.metric import Metrics
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms import trigger


def timestamp2str(t, fmt='%Y-%m-%d %H:%M:%S.000'):
  """Converts a unix timestamp into a formatted string.

  Args:
    t: Unix timestamp in seconds.
    fmt: strftime format. The default ends in a literal '.000', so the
      sub-second part of the output is always zero.

  Returns:
    The formatted time. `datetime.fromtimestamp` is called without a tzinfo,
    so the result is expressed in the local timezone of the calling process.
  """
  return datetime.fromtimestamp(t).strftime(fmt)


class ParseGameEventFn(beam.DoFn):
  """Parses the raw game event info into a Python dictionary.

  Each event line has the following format:
    username,teamname,score,timestamp_in_ms,readable_time

  e.g.:
    user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224

  The human-readable time string is not used here.
  """
  def __init__(self):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
    self.num_parse_errors = Metrics.counter(self.__class__, 'num_parse_errors')

  def process(self, elem):
    """Yields one parsed event dict per well-formed line.

    Args:
      elem: A single CSV-formatted event line (str).

    Yields:
      A dict with keys 'user', 'team', 'score' (int) and 'timestamp'
      (float seconds, converted from the millisecond field). Malformed lines
      yield nothing; they are counted in `num_parse_errors` and logged.
    """
    try:
      row = list(csv.reader([elem]))[0]
      event = {
          'user': row[0],
          'team': row[1],
          'score': int(row[2]),
          'timestamp': int(row[3]) / 1000.0,
      }
    except Exception:  # pylint: disable=broad-except
      # Log and count parse errors. Only the parsing itself is guarded:
      # exceptions raised at the yield point (e.g. GeneratorExit when the
      # generator is closed) and KeyboardInterrupt/SystemExit must not be
      # reported as parse errors.
      self.num_parse_errors.inc()
      logging.error('Parse error on "%s"', elem)
    else:
      yield event


class ExtractAndSumScore(beam.PTransform):
  """A transform to extract key/score information and sum the scores.
  The constructor argument `field` determines whether 'team' or 'user' info is
  extracted.
  """
  def __init__(self, field):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.field = field

  def expand(self, pcoll):
    """Maps each event dict to (event[field], event['score']) and sums per key.
    """
    return (
        pcoll
        | beam.Map(lambda elem: (elem[self.field], elem['score']))
        | beam.CombinePerKey(sum))


class TeamScoresDict(beam.DoFn):
  """Formats the data into a dictionary of BigQuery columns with their values

  Receives a (team, score) pair, extracts the window start timestamp, and
  formats everything together into a dictionary. The dictionary is in the format
  {'bigquery_column': value}
  """
  def process(self, team_score, window=beam.DoFn.WindowParam):
    """Yields one row dict for a (team, score) pair.

    Args:
      team_score: A (team, score) tuple.
      window: The element's window; only its `start` is read.

    Yields:
      A dict with keys 'team', 'total_score', 'window_start' and
      'processing_time'. Both time fields are strings produced by
      `timestamp2str`; 'processing_time' is taken from `time.time()` at the
      moment the row is formatted.
    """
    team, score = team_score
    start = timestamp2str(int(window.start))
    yield {
        'team': team,
        'total_score': score,
        'window_start': start,
        'processing_time': timestamp2str(int(time.time()))
    }


class WriteToBigQuery(beam.PTransform):
  """Generate, format, and write BigQuery table row information."""
  def __init__(self, table_name, dataset, schema, project):
    """Initializes the transform.
    Args:
      table_name: Name of the BigQuery table to use.
      dataset: Name of the dataset to use.
      schema: Dictionary in the format {'column_name': 'bigquery_type'}
      project: Name of the Cloud project containing BigQuery table.
    """
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    self.table_name = table_name
    self.dataset = dataset
    self.schema = schema
    self.project = project

  def get_schema(self):
    """Build the output table schema.

    Returns:
      A 'name:TYPE, name:TYPE' string with one entry per column of
      `self.schema`, in the dictionary's iteration order.
    """
    return ', '.join('%s:%s' % (col, self.schema[col]) for col in self.schema)

  def expand(self, pcoll):
    """Projects each element onto the schema's columns and writes the rows."""
    return (
        pcoll
        # Keep only the columns named in the schema; a missing column raises
        # KeyError for that element.
        | 'ConvertToRow' >>
        beam.Map(lambda elem: {col: elem[col]
                               for col in self.schema})
        | beam.io.WriteToBigQuery(
            self.table_name, self.dataset, self.project, self.get_schema()))


# [START window_and_trigger]
class CalculateTeamScores(beam.PTransform):
  """Calculates scores for each team within the configured window duration.

  Extract team/score pairs from the event stream, using hour-long windows by
  default.
  """
  def __init__(self, team_window_duration, allowed_lateness):
    """Initializes the transform.
    Args:
      team_window_duration: Fixed window size, in minutes.
      allowed_lateness: Allowed data lateness, in minutes.
    """
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    # Both values arrive in minutes and are stored in seconds.
    self.team_window_duration = team_window_duration * 60
    self.allowed_lateness_seconds = allowed_lateness * 60

  def expand(self, pcoll):
    # NOTE: the behavior does not exactly match the Java example
    # TODO: AfterProcessingTime not implemented yet, replace AfterCount
    return (
        pcoll
        # We will get early (speculative) results as well as cumulative
        # processing of late data.
        | 'LeaderboardTeamFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(self.team_window_duration),
            trigger=trigger.AfterWatermark(
                trigger.AfterCount(10), trigger.AfterCount(20)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
            # Without this the --allowed_lateness flag is silently ignored.
            allowed_lateness=self.allowed_lateness_seconds)
        # Extract and sum teamname/score pairs from the event data.
        | 'ExtractAndSumScore' >> ExtractAndSumScore('team'))


# [END window_and_trigger]


# [START processing_time_trigger]
class CalculateUserScores(beam.PTransform):
  """Extract user/score pairs from the event stream using processing time, via
  global windowing. Get periodic updates on all users' running scores.
  """
  def __init__(self, allowed_lateness):
    """Initializes the transform.
    Args:
      allowed_lateness: Allowed data lateness, in minutes.
    """
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.PTransform.__init__(self)
    # Stored in seconds.
    self.allowed_lateness_seconds = allowed_lateness * 60

  def expand(self, pcoll):
    # NOTE: the behavior does not exactly match the Java example
    # TODO: AfterProcessingTime not implemented yet, replace AfterCount
    return (
        pcoll
        # Get periodic results every ten events.
        | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(10)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
            # Without this the --allowed_lateness flag is silently ignored.
            allowed_lateness=self.allowed_lateness_seconds)
        # Extract and sum username/score pairs from the event data.
        | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))


# [END processing_time_trigger]


def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the leader_board pipeline.

  Args:
    argv: Command-line arguments. Arguments not recognised by this module's
      parser are forwarded to PipelineOptions.
    save_main_session: Value assigned to SetupOptions.save_main_session.

  Exits the process with status 1 when neither --topic nor --subscription is
  given, or when --project is missing from the pipeline options.
  """
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
  parser.add_argument(
      '--subscription', type=str, help='Pub/Sub subscription to read from')
  parser.add_argument(
      '--dataset',
      type=str,
      required=True,
      help='BigQuery Dataset to write tables to. '
      'Must already exist.')
  parser.add_argument(
      '--table_name',
      default='leader_board',
      help='The BigQuery table name. Should not already exist.')
  parser.add_argument(
      '--team_window_duration',
      type=int,
      default=60,
      help='Numeric value of fixed window duration for team '
      'analysis, in minutes')
  parser.add_argument(
      '--allowed_lateness',
      type=int,
      default=120,
      help='Numeric value of allowed data lateness, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  if args.topic is None and args.subscription is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: one of --topic or --subscription is required')
    sys.exit(1)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  # Enforce that this pipeline is always run in streaming mode
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read game events from Pub/Sub using custom timestamps, which are extracted
    # from the pubsub data elements, and parse the data.

    # Read from PubSub into a PCollection. A subscription, when given, takes
    # precedence over the topic.
    if args.subscription:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=args.subscription)
    else:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(topic=args.topic)

    events = (
        scores
        | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
        | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
        | 'AddEventTimestamps' >> beam.Map(
            lambda elem: beam.window.TimestampedValue(elem, elem['timestamp'])))

    # Get team scores and write the results to BigQuery
    (  # pylint: disable=expression-not-assigned
        events
        | 'CalculateTeamScores' >> CalculateTeamScores(
            args.team_window_duration, args.allowed_lateness)
        | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
        | 'WriteTeamScoreSums' >> WriteToBigQuery(
            args.table_name + '_teams',
            args.dataset,
            {
                'team': 'STRING',
                'total_score': 'INTEGER',
                'window_start': 'STRING',
                'processing_time': 'STRING',
            },
            options.view_as(GoogleCloudOptions).project))

    def format_user_score_sums(user_score):
      """Turns a (user, score) pair into a row dict for the users table."""
      (user, score) = user_score
      return {'user': user, 'total_score': score}

    # Get user scores and write the results to BigQuery
    (  # pylint: disable=expression-not-assigned
        events
        | 'CalculateUserScores' >> CalculateUserScores(args.allowed_lateness)
        | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
        | 'WriteUserScoreSums' >> WriteToBigQuery(
            args.table_name + '_users',
            args.dataset, {
                'user': 'STRING',
                'total_score': 'INTEGER',
            },
            options.view_as(GoogleCloudOptions).project))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()