github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/snippets/transforms/elementwise/regex.py (about)

     1  # coding=utf-8
     2  #
     3  # Licensed to the Apache Software Foundation (ASF) under one or more
     4  # contributor license agreements.  See the NOTICE file distributed with
     5  # this work for additional information regarding copyright ownership.
     6  # The ASF licenses this file to You under the Apache License, Version 2.0
     7  # (the "License"); you may not use this file except in compliance with
     8  # the License.  You may obtain a copy of the License at
     9  #
    10  #    http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  # Unless required by applicable law or agreed to in writing, software
    13  # distributed under the License is distributed on an "AS IS" BASIS,
    14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  # See the License for the specific language governing permissions and
    16  # limitations under the License.
    17  #
    18  
    19  # pytype: skip-file
    20  
    21  
    22  def regex_matches(test=None):
    23    # [START regex_matches]
    24    import apache_beam as beam
    25  
    26    # Matches a named group 'icon', and then two comma-separated groups.
    27    regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
    28    with beam.Pipeline() as pipeline:
    29      plants_matches = (
    30          pipeline
    31          | 'Garden plants' >> beam.Create([
    32              '🍓, Strawberry, perennial',
    33              '🥕, Carrot, biennial ignoring trailing words',
    34              '🍆, Eggplant, perennial',
    35              '🍅, Tomato, annual',
    36              '🥔, Potato, perennial',
    37              '# 🍌, invalid, format',
    38              'invalid, 🍉, format',
    39          ])
    40          | 'Parse plants' >> beam.Regex.matches(regex)
    41          | beam.Map(print))
    42      # [END regex_matches]
    43      if test:
    44        test(plants_matches)
    45  
    46  
    47  def regex_all_matches(test=None):
    48    # [START regex_all_matches]
    49    import apache_beam as beam
    50  
    51    # Matches a named group 'icon', and then two comma-separated groups.
    52    regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
    53    with beam.Pipeline() as pipeline:
    54      plants_all_matches = (
    55          pipeline
    56          | 'Garden plants' >> beam.Create([
    57              '🍓, Strawberry, perennial',
    58              '🥕, Carrot, biennial ignoring trailing words',
    59              '🍆, Eggplant, perennial',
    60              '🍅, Tomato, annual',
    61              '🥔, Potato, perennial',
    62              '# 🍌, invalid, format',
    63              'invalid, 🍉, format',
    64          ])
    65          | 'Parse plants' >> beam.Regex.all_matches(regex)
    66          | beam.Map(print))
    67      # [END regex_all_matches]
    68      if test:
    69        test(plants_all_matches)
    70  
    71  
    72  def regex_matches_kv(test=None):
    73    # [START regex_matches_kv]
    74    import apache_beam as beam
    75  
    76    # Matches a named group 'icon', and then two comma-separated groups.
    77    regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
    78    with beam.Pipeline() as pipeline:
    79      plants_matches_kv = (
    80          pipeline
    81          | 'Garden plants' >> beam.Create([
    82              '🍓, Strawberry, perennial',
    83              '🥕, Carrot, biennial ignoring trailing words',
    84              '🍆, Eggplant, perennial',
    85              '🍅, Tomato, annual',
    86              '🥔, Potato, perennial',
    87              '# 🍌, invalid, format',
    88              'invalid, 🍉, format',
    89          ])
    90          | 'Parse plants' >> beam.Regex.matches_kv(regex, keyGroup='icon')
    91          | beam.Map(print))
    92      # [END regex_matches_kv]
    93      if test:
    94        test(plants_matches_kv)
    95  
    96  
    97  def regex_find(test=None):
    98    # [START regex_find]
    99    import apache_beam as beam
   100  
   101    # Matches a named group 'icon', and then two comma-separated groups.
   102    regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
   103    with beam.Pipeline() as pipeline:
   104      plants_matches = (
   105          pipeline
   106          | 'Garden plants' >> beam.Create([
   107              '# 🍓, Strawberry, perennial',
   108              '# 🥕, Carrot, biennial ignoring trailing words',
   109              '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
   110              '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
   111              '# 🥔, Potato, perennial',
   112          ])
   113          | 'Parse plants' >> beam.Regex.find(regex)
   114          | beam.Map(print))
   115      # [END regex_find]
   116      if test:
   117        test(plants_matches)
   118  
   119  
   120  def regex_find_all(test=None):
   121    # [START regex_find_all]
   122    import apache_beam as beam
   123  
   124    # Matches a named group 'icon', and then two comma-separated groups.
   125    regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
   126    with beam.Pipeline() as pipeline:
   127      plants_find_all = (
   128          pipeline
   129          | 'Garden plants' >> beam.Create([
   130              '# 🍓, Strawberry, perennial',
   131              '# 🥕, Carrot, biennial ignoring trailing words',
   132              '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
   133              '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
   134              '# 🥔, Potato, perennial',
   135          ])
   136          | 'Parse plants' >> beam.Regex.find_all(regex)
   137          | beam.Map(print))
   138      # [END regex_find_all]
   139      if test:
   140        test(plants_find_all)
   141  
   142  
   143  def regex_find_kv(test=None):
   144    # [START regex_find_kv]
   145    import apache_beam as beam
   146  
   147    # Matches a named group 'icon', and then two comma-separated groups.
   148    regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
   149    with beam.Pipeline() as pipeline:
   150      plants_matches_kv = (
   151          pipeline
   152          | 'Garden plants' >> beam.Create([
   153              '# 🍓, Strawberry, perennial',
   154              '# 🥕, Carrot, biennial ignoring trailing words',
   155              '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
   156              '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
   157              '# 🥔, Potato, perennial',
   158          ])
   159          | 'Parse plants' >> beam.Regex.find_kv(regex, keyGroup='icon')
   160          | beam.Map(print))
   161      # [END regex_find_kv]
   162      if test:
   163        test(plants_matches_kv)
   164  
   165  
   166  def regex_replace_all(test=None):
   167    # [START regex_replace_all]
   168    import apache_beam as beam
   169  
   170    with beam.Pipeline() as pipeline:
   171      plants_replace_all = (
   172          pipeline
   173          | 'Garden plants' >> beam.Create([
   174              '🍓 : Strawberry : perennial',
   175              '🥕 : Carrot : biennial',
   176              '🍆\t:\tEggplant\t:\tperennial',
   177              '🍅 : Tomato : annual',
   178              '🥔 : Potato : perennial',
   179          ])
   180          | 'To CSV' >> beam.Regex.replace_all(r'\s*:\s*', ',')
   181          | beam.Map(print))
   182      # [END regex_replace_all]
   183      if test:
   184        test(plants_replace_all)
   185  
   186  
   187  def regex_replace_first(test=None):
   188    # [START regex_replace_first]
   189    import apache_beam as beam
   190  
   191    with beam.Pipeline() as pipeline:
   192      plants_replace_first = (
   193          pipeline
   194          | 'Garden plants' >> beam.Create([
   195              '🍓, Strawberry, perennial',
   196              '🥕, Carrot, biennial',
   197              '🍆,\tEggplant, perennial',
   198              '🍅, Tomato, annual',
   199              '🥔, Potato, perennial',
   200          ])
   201          | 'As dictionary' >> beam.Regex.replace_first(r'\s*,\s*', ': ')
   202          | beam.Map(print))
   203      # [END regex_replace_first]
   204      if test:
   205        test(plants_replace_first)
   206  
   207  
   208  def regex_split(test=None):
   209    # [START regex_split]
   210    import apache_beam as beam
   211  
   212    with beam.Pipeline() as pipeline:
   213      plants_split = (
   214          pipeline
   215          | 'Garden plants' >> beam.Create([
   216              '🍓 : Strawberry : perennial',
   217              '🥕 : Carrot : biennial',
   218              '🍆\t:\tEggplant : perennial',
   219              '🍅 : Tomato : annual',
   220              '🥔 : Potato : perennial',
   221          ])
   222          | 'Parse plants' >> beam.Regex.split(r'\s*:\s*')
   223          | beam.Map(print))
   224      # [END regex_split]
   225      if test:
   226        test(plants_split)