# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/coders.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A workflow using custom JSON-based coders for text sources and sinks.

The input file contains a JSON string on each line describing a match
record using the following schema:

  {'guest': [TEAM_NAME, GOALS], 'host': [TEAM_NAME, GOALS]}

The output file will contain the computed points for each team with one team
per line in the following format:

  [TEAM_NAME, POINTS]
"""

# pytype: skip-file

import argparse
import json
import logging

import apache_beam as beam
from apache_beam.coders import Coder
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class JsonCoder(Coder):
  """A JSON coder interpreting each line as a JSON string."""
  def encode(self, x):
    """Serializes ``x`` to JSON text and returns it as UTF-8 encoded bytes."""
    return json.dumps(x).encode('utf-8')

  def decode(self, x):
    """Parses ``x`` as a JSON document and returns the resulting object."""
    return json.loads(x)


def compute_points(record):
  """Compute points based on the record containing the match result.

  The function assigns 3 points for a win, 1 point for a draw, and 0 points for
  a loss (see http://en.wikipedia.org/wiki/Three_points_for_a_win).

  Args:
    record: A mapping with 'host' and 'guest' entries, each of which unpacks
      into a (team name, goals) pair.

  Yields:
    Two (team name, points) tuples: first the host's, then the guest's.
  """
  host_name, host_goals = record['host']
  guest_name, guest_goals = record['guest']
  if host_goals == guest_goals:
    yield host_name, 1
    yield guest_name, 1
  elif host_goals > guest_goals:
    yield host_name, 3
    yield guest_name, 0
  else:
    yield host_name, 0
    yield guest_name, 3


def run(argv=None):
  """Runs the workflow computing total points from a collection of matches.

  Args:
    argv: Command-line arguments. '--input' and '--output' are consumed here;
      every other argument is forwarded to PipelineOptions. Passed as-is to
      ``ArgumentParser.parse_known_args``.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument('--input', required=True, help='Input file to process.')
  parser.add_argument(
      '--output', required=True, help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  with beam.Pipeline(options=pipeline_options) as p:
    (  # pylint: disable=expression-not-assigned
        p
        | 'read' >> ReadFromText(known_args.input, coder=JsonCoder())
        | 'points' >> beam.FlatMap(compute_points)
        | beam.CombinePerKey(sum)
        | 'write' >> WriteToText(known_args.output, coder=JsonCoder()))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()