github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/coders.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A workflow using custom JSON-based coders for text sources and sinks.
    19  
    20  The input file contains a JSON string on each line describing a match
    21  record using the following schema:
    22  
    23    {'guest': [TEAM_NAME, GOALS], 'host': [TEAM_NAME, GOALS]}
    24  
    25  The output file will contain the computed points for each team with one team
    26  per line in the following format:
    27  
    28    [TEAM_NAME, POINTS]
    29  """
    30  
    31  # pytype: skip-file
    32  
    33  import argparse
    34  import json
    35  import logging
    36  
    37  import apache_beam as beam
    38  from apache_beam.coders import Coder
    39  from apache_beam.io import ReadFromText
    40  from apache_beam.io import WriteToText
    41  from apache_beam.options.pipeline_options import PipelineOptions
    42  from apache_beam.options.pipeline_options import SetupOptions
    43  
    44  
    45  class JsonCoder(Coder):
    46    """A JSON coder interpreting each line as a JSON string."""
    47    def encode(self, x):
    48      return json.dumps(x).encode('utf-8')
    49  
    50    def decode(self, x):
    51      return json.loads(x)
    52  
    53  
    54  def compute_points(record):
    55    """Compute points based on the record containing the match result.
    56  
    57    The function assigns 3 points for a win, 1 point for a draw, and 0 points for
    58    a loss (see http://en.wikipedia.org/wiki/Three_points_for_a_win).
    59    """
    60    host_name, host_goals = record['host']
    61    guest_name, guest_goals = record['guest']
    62    if host_goals == guest_goals:
    63      yield host_name, 1
    64      yield guest_name, 1
    65    elif host_goals > guest_goals:
    66      yield host_name, 3
    67      yield guest_name, 0
    68    else:
    69      yield host_name, 0
    70      yield guest_name, 3
    71  
    72  
    73  def run(argv=None):
    74    """Runs the workflow computing total points from a collection of matches."""
    75  
    76    parser = argparse.ArgumentParser()
    77    parser.add_argument('--input', required=True, help='Input file to process.')
    78    parser.add_argument(
    79        '--output', required=True, help='Output file to write results to.')
    80    known_args, pipeline_args = parser.parse_known_args(argv)
    81    # We use the save_main_session option because one or more DoFn's in this
    82    # workflow rely on global context (e.g., a module imported at module level).
    83    pipeline_options = PipelineOptions(pipeline_args)
    84    pipeline_options.view_as(SetupOptions).save_main_session = True
    85  
    86    with beam.Pipeline(options=pipeline_options) as p:
    87      (  # pylint: disable=expression-not-assigned
    88          p
    89          | 'read' >> ReadFromText(known_args.input, coder=JsonCoder())
    90          | 'points' >> beam.FlatMap(compute_points)
    91          | beam.CombinePerKey(sum)
    92          | 'write' >> WriteToText(known_args.output, coder=JsonCoder()))
    93  
    94  
    95  if __name__ == '__main__':
    96    logging.getLogger().setLevel(logging.INFO)
    97    run()