github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/snippets/transforms/elementwise/flatmap.py (about)

     1  # coding=utf-8
     2  #
     3  # Licensed to the Apache Software Foundation (ASF) under one or more
     4  # contributor license agreements.  See the NOTICE file distributed with
     5  # this work for additional information regarding copyright ownership.
     6  # The ASF licenses this file to You under the Apache License, Version 2.0
     7  # (the "License"); you may not use this file except in compliance with
     8  # the License.  You may obtain a copy of the License at
     9  #
    10  #    http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  # Unless required by applicable law or agreed to in writing, software
    13  # distributed under the License is distributed on an "AS IS" BASIS,
    14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  # See the License for the specific language governing permissions and
    16  # limitations under the License.
    17  #
    18  
    19  # pytype: skip-file
    20  
    21  
    22  def flatmap_simple(test=None):
    23    # [START flatmap_simple]
    24    import apache_beam as beam
    25  
    26    with beam.Pipeline() as pipeline:
    27      plants = (
    28          pipeline
    29          | 'Gardening plants' >> beam.Create([
    30              '🍓Strawberry 🥕Carrot 🍆Eggplant',
    31              '🍅Tomato 🥔Potato',
    32          ])
    33          | 'Split words' >> beam.FlatMap(str.split)
    34          | beam.Map(print))
    35      # [END flatmap_simple]
    36      if test:
    37        test(plants)
    38  
    39  
    40  def flatmap_function(test=None):
    41    # [START flatmap_function]
    42    import apache_beam as beam
    43  
    44    def split_words(text):
    45      return text.split(',')
    46  
    47    with beam.Pipeline() as pipeline:
    48      plants = (
    49          pipeline
    50          | 'Gardening plants' >> beam.Create([
    51              '🍓Strawberry,🥕Carrot,🍆Eggplant',
    52              '🍅Tomato,🥔Potato',
    53          ])
    54          | 'Split words' >> beam.FlatMap(split_words)
    55          | beam.Map(print))
    56      # [END flatmap_function]
    57      if test:
    58        test(plants)
    59  
    60  
    61  def flatmap_lambda(test=None):
    62    # [START flatmap_lambda]
    63    import apache_beam as beam
    64  
    65    with beam.Pipeline() as pipeline:
    66      plants = (
    67          pipeline
    68          | 'Gardening plants' >> beam.Create([
    69              ['🍓Strawberry', '🥕Carrot', '🍆Eggplant'],
    70              ['🍅Tomato', '🥔Potato'],
    71          ])
    72          | 'Flatten lists' >> beam.FlatMap(lambda elements: elements)
    73          | beam.Map(print))
    74      # [END flatmap_lambda]
    75      if test:
    76        test(plants)
    77  
    78  
    79  def flatmap_generator(test=None):
    80    # [START flatmap_generator]
    81    import apache_beam as beam
    82  
    83    def generate_elements(elements):
    84      for element in elements:
    85        yield element
    86  
    87    with beam.Pipeline() as pipeline:
    88      plants = (
    89          pipeline
    90          | 'Gardening plants' >> beam.Create([
    91              ['🍓Strawberry', '🥕Carrot', '🍆Eggplant'],
    92              ['🍅Tomato', '🥔Potato'],
    93          ])
    94          | 'Flatten lists' >> beam.FlatMap(generate_elements)
    95          | beam.Map(print))
    96      # [END flatmap_generator]
    97      if test:
    98        test(plants)
    99  
   100  
   101  def flatmap_multiple_arguments(test=None):
   102    # [START flatmap_multiple_arguments]
   103    import apache_beam as beam
   104  
   105    def split_words(text, delimiter=None):
   106      return text.split(delimiter)
   107  
   108    with beam.Pipeline() as pipeline:
   109      plants = (
   110          pipeline
   111          | 'Gardening plants' >> beam.Create([
   112              '🍓Strawberry,🥕Carrot,🍆Eggplant',
   113              '🍅Tomato,🥔Potato',
   114          ])
   115          | 'Split words' >> beam.FlatMap(split_words, delimiter=',')
   116          | beam.Map(print))
   117      # [END flatmap_multiple_arguments]
   118      if test:
   119        test(plants)
   120  
   121  
   122  def flatmap_tuple(test=None):
   123    # [START flatmap_tuple]
   124    import apache_beam as beam
   125  
   126    def format_plant(icon, plant):
   127      if icon:
   128        yield '{}{}'.format(icon, plant)
   129  
   130    with beam.Pipeline() as pipeline:
   131      plants = (
   132          pipeline
   133          | 'Gardening plants' >> beam.Create([
   134              ('🍓', 'Strawberry'),
   135              ('🥕', 'Carrot'),
   136              ('🍆', 'Eggplant'),
   137              ('🍅', 'Tomato'),
   138              ('🥔', 'Potato'),
   139              (None, 'Invalid'),
   140          ])
   141          | 'Format' >> beam.FlatMapTuple(format_plant)
   142          | beam.Map(print))
   143      # [END flatmap_tuple]
   144      if test:
   145        test(plants)
   146  
   147  
   148  def flatmap_side_inputs_singleton(test=None):
   149    # [START flatmap_side_inputs_singleton]
   150    import apache_beam as beam
   151  
   152    with beam.Pipeline() as pipeline:
   153      delimiter = pipeline | 'Create delimiter' >> beam.Create([','])
   154  
   155      plants = (
   156          pipeline
   157          | 'Gardening plants' >> beam.Create([
   158              '🍓Strawberry,🥕Carrot,🍆Eggplant',
   159              '🍅Tomato,🥔Potato',
   160          ])
   161          | 'Split words' >> beam.FlatMap(
   162              lambda text,
   163              delimiter: text.split(delimiter),
   164              delimiter=beam.pvalue.AsSingleton(delimiter),
   165          )
   166          | beam.Map(print))
   167      # [END flatmap_side_inputs_singleton]
   168      if test:
   169        test(plants)
   170  
   171  
   172  def flatmap_side_inputs_iter(test=None):
   173    # [START flatmap_side_inputs_iter]
   174    import apache_beam as beam
   175  
   176    def normalize_and_validate_durations(plant, valid_durations):
   177      plant['duration'] = plant['duration'].lower()
   178      if plant['duration'] in valid_durations:
   179        yield plant
   180  
   181    with beam.Pipeline() as pipeline:
   182      valid_durations = pipeline | 'Valid durations' >> beam.Create([
   183          'annual',
   184          'biennial',
   185          'perennial',
   186      ])
   187  
   188      valid_plants = (
   189          pipeline
   190          | 'Gardening plants' >> beam.Create([
   191              {
   192                  'icon': '🍓', 'name': 'Strawberry', 'duration': 'Perennial'
   193              },
   194              {
   195                  'icon': '🥕', 'name': 'Carrot', 'duration': 'BIENNIAL'
   196              },
   197              {
   198                  'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'
   199              },
   200              {
   201                  'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'
   202              },
   203              {
   204                  'icon': '🥔', 'name': 'Potato', 'duration': 'unknown'
   205              },
   206          ])
   207          | 'Normalize and validate durations' >> beam.FlatMap(
   208              normalize_and_validate_durations,
   209              valid_durations=beam.pvalue.AsIter(valid_durations),
   210          )
   211          | beam.Map(print))
   212      # [END flatmap_side_inputs_iter]
   213      if test:
   214        test(valid_plants)
   215  
   216  
   217  def flatmap_side_inputs_dict(test=None):
   218    # [START flatmap_side_inputs_dict]
   219    import apache_beam as beam
   220  
   221    def replace_duration_if_valid(plant, durations):
   222      if plant['duration'] in durations:
   223        plant['duration'] = durations[plant['duration']]
   224        yield plant
   225  
   226    with beam.Pipeline() as pipeline:
   227      durations = pipeline | 'Durations dict' >> beam.Create([
   228          (0, 'annual'),
   229          (1, 'biennial'),
   230          (2, 'perennial'),
   231      ])
   232  
   233      valid_plants = (
   234          pipeline
   235          | 'Gardening plants' >> beam.Create([
   236              {
   237                  'icon': '🍓', 'name': 'Strawberry', 'duration': 2
   238              },
   239              {
   240                  'icon': '🥕', 'name': 'Carrot', 'duration': 1
   241              },
   242              {
   243                  'icon': '🍆', 'name': 'Eggplant', 'duration': 2
   244              },
   245              {
   246                  'icon': '🍅', 'name': 'Tomato', 'duration': 0
   247              },
   248              {
   249                  'icon': '🥔', 'name': 'Potato', 'duration': -1
   250              },
   251          ])
   252          | 'Replace duration if valid' >> beam.FlatMap(
   253              replace_duration_if_valid,
   254              durations=beam.pvalue.AsDict(durations),
   255          )
   256          | beam.Map(print))
   257      # [END flatmap_side_inputs_dict]
   258      if test:
   259        test(valid_plants)