github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/custom_ptransform.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Various implementations of a Count custom PTransform.
    19  
    20  These examples show the different ways you can write custom PTransforms.
    21  """
    22  
    23  # pytype: skip-file
    24  
    25  import argparse
    26  import logging
    27  
    28  import apache_beam as beam
    29  from apache_beam.io import ReadFromText
    30  from apache_beam.io import WriteToText
    31  from apache_beam.options.pipeline_options import PipelineOptions
    32  
    33  # pylint doesn't understand our pipeline syntax:
    34  # pylint:disable=expression-not-assigned
    35  
    36  
    37  class Count1(beam.PTransform):
    38    """Count as a subclass of PTransform, with an apply method."""
    39    def expand(self, pcoll):
    40      return (
    41          pcoll
    42          | 'ParWithOne' >> beam.Map(lambda v: (v, 1))
    43          | beam.CombinePerKey(sum))
    44  
    45  
    46  def run_count1(known_args, options):
    47    """Runs the first example pipeline."""
    48    logging.info('Running first pipeline')
    49    with beam.Pipeline(options=options) as p:
    50      (
    51          p | beam.io.ReadFromText(known_args.input)
    52          | Count1()
    53          | beam.io.WriteToText(known_args.output))
    54  
    55  
    56  @beam.ptransform_fn
    57  def Count2(pcoll):  # pylint: disable=invalid-name
    58    """Count as a decorated function."""
    59    return (
    60        pcoll
    61        | 'PairWithOne' >> beam.Map(lambda v: (v, 1))
    62        | beam.CombinePerKey(sum))
    63  
    64  
    65  def run_count2(known_args, options):
    66    """Runs the second example pipeline."""
    67    logging.info('Running second pipeline')
    68    with beam.Pipeline(options=options) as p:
    69      (
    70          p | ReadFromText(known_args.input)
    71          | Count2()  # pylint: disable=no-value-for-parameter
    72          | WriteToText(known_args.output))
    73  
    74  
    75  @beam.ptransform_fn
    76  def Count3(pcoll, factor=1):  # pylint: disable=invalid-name
    77    """Count as a decorated function with a side input.
    78  
    79    Args:
    80      pcoll: the PCollection passed in from the previous transform
    81      factor: the amount by which to count
    82  
    83    Returns:
    84      A PCollection counting the number of times each unique element occurs.
    85    """
    86    return (
    87        pcoll
    88        | 'PairWithOne' >> beam.Map(lambda v: (v, factor))
    89        | beam.CombinePerKey(sum))
    90  
    91  
    92  def run_count3(known_args, options):
    93    """Runs the third example pipeline."""
    94    logging.info('Running third pipeline')
    95    with beam.Pipeline(options=options) as p:
    96      (
    97          p | ReadFromText(known_args.input)
    98          | Count3(2)  # pylint: disable=no-value-for-parameter
    99          | WriteToText(known_args.output))
   100  
   101  
   102  def get_args(argv):
   103    """Determines user specified arguments from the given list of arguments.
   104  
   105    Args:
   106      argv: all arguments.
   107  
   108    Returns:
   109      A pair of argument lists containing known and remaining arguments.
   110    """
   111  
   112    parser = argparse.ArgumentParser()
   113    parser.add_argument('--input', required=True, help='Input file to process.')
   114    parser.add_argument(
   115        '--output', required=True, help='Output file to write results to.')
   116    return parser.parse_known_args(argv)
   117  
   118  
   119  def run(argv=None):
   120    known_args, pipeline_args = get_args(argv)
   121  
   122    # pipeline initialization may modify PipelineOptions object.
   123    # Create instances for each.
   124    run_count1(known_args, PipelineOptions(pipeline_args))
   125    run_count2(known_args, PipelineOptions(pipeline_args))
   126    run_count3(known_args, PipelineOptions(pipeline_args))
   127  
   128  
   129  if __name__ == '__main__':
   130    logging.getLogger().setLevel(logging.INFO)
   131    run()