# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file

# NOTE: every function below is a self-contained documentation snippet. The
# text between the `[START name]` / `[END name]` markers is meant to be read
# on its own, so the import, the regex and the sample data are intentionally
# repeated in each function instead of being shared at module level.


def regex_matches(test=None):
  """Run the `beam.Regex.matches` example and print each output element.

  The input mixes well-formed 'icon, name, duration' lines with two lines
  labelled as invalid in the sample data.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_matches]
  import apache_beam as beam

  # Matches a named group 'icon', and then two comma-separated groups.
  regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
  with beam.Pipeline() as pipeline:
    plants_matches = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '🍓, Strawberry, perennial',
            '🥕, Carrot, biennial ignoring trailing words',
            '🍆, Eggplant, perennial',
            '🍅, Tomato, annual',
            '🥔, Potato, perennial',
            '# 🍌, invalid, format',
            'invalid, 🍉, format',
        ])
        | 'Parse plants' >> beam.Regex.matches(regex)
        | beam.Map(print))
    # [END regex_matches]
    # Kept inside the `with` block so the check is attached before the
    # pipeline context exits.
    if test:
      test(plants_matches)


def regex_all_matches(test=None):
  """Run the `beam.Regex.all_matches` example and print each output element.

  Uses the same sample input as `regex_matches`.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_all_matches]
  import apache_beam as beam

  # Matches a named group 'icon', and then two comma-separated groups.
  regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
  with beam.Pipeline() as pipeline:
    plants_all_matches = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '🍓, Strawberry, perennial',
            '🥕, Carrot, biennial ignoring trailing words',
            '🍆, Eggplant, perennial',
            '🍅, Tomato, annual',
            '🥔, Potato, perennial',
            '# 🍌, invalid, format',
            'invalid, 🍉, format',
        ])
        | 'Parse plants' >> beam.Regex.all_matches(regex)
        | beam.Map(print))
    # [END regex_all_matches]
    if test:
      test(plants_all_matches)


def regex_matches_kv(test=None):
  """Run the `beam.Regex.matches_kv` example and print each output element.

  The named group 'icon' is passed as `keyGroup`.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_matches_kv]
  import apache_beam as beam

  # Matches a named group 'icon', and then two comma-separated groups.
  regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
  with beam.Pipeline() as pipeline:
    plants_matches_kv = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '🍓, Strawberry, perennial',
            '🥕, Carrot, biennial ignoring trailing words',
            '🍆, Eggplant, perennial',
            '🍅, Tomato, annual',
            '🥔, Potato, perennial',
            '# 🍌, invalid, format',
            'invalid, 🍉, format',
        ])
        | 'Parse plants' >> beam.Regex.matches_kv(regex, keyGroup='icon')
        | beam.Map(print))
    # [END regex_matches_kv]
    if test:
      test(plants_matches_kv)


def regex_find(test=None):
  """Run the `beam.Regex.find` example and print each output element.

  Every input line starts with '# ' and some lines hold two plant records
  separated by ' - '.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_find]
  import apache_beam as beam

  # Matches a named group 'icon', and then two comma-separated groups.
  regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
  with beam.Pipeline() as pipeline:
    plants_find = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '# 🍓, Strawberry, perennial',
            '# 🥕, Carrot, biennial ignoring trailing words',
            '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
            '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
            '# 🥔, Potato, perennial',
        ])
        | 'Parse plants' >> beam.Regex.find(regex)
        | beam.Map(print))
    # [END regex_find]
    if test:
      test(plants_find)


def regex_find_all(test=None):
  """Run the `beam.Regex.find_all` example and print each output element.

  Uses the same sample input as `regex_find`.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_find_all]
  import apache_beam as beam

  # Matches a named group 'icon', and then two comma-separated groups.
  regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
  with beam.Pipeline() as pipeline:
    plants_find_all = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '# 🍓, Strawberry, perennial',
            '# 🥕, Carrot, biennial ignoring trailing words',
            '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
            '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
            '# 🥔, Potato, perennial',
        ])
        | 'Parse plants' >> beam.Regex.find_all(regex)
        | beam.Map(print))
    # [END regex_find_all]
    if test:
      test(plants_find_all)


def regex_find_kv(test=None):
  """Run the `beam.Regex.find_kv` example and print each output element.

  Uses the same sample input as `regex_find`; the named group 'icon' is
  passed as `keyGroup`.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_find_kv]
  import apache_beam as beam

  # Matches a named group 'icon', and then two comma-separated groups.
  regex = r'(?P<icon>[^\s,]+), *(\w+), *(\w+)'
  with beam.Pipeline() as pipeline:
    plants_find_kv = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '# 🍓, Strawberry, perennial',
            '# 🥕, Carrot, biennial ignoring trailing words',
            '# 🍆, Eggplant, perennial - 🍌, Banana, perennial',
            '# 🍅, Tomato, annual - 🍉, Watermelon, annual',
            '# 🥔, Potato, perennial',
        ])
        | 'Parse plants' >> beam.Regex.find_kv(regex, keyGroup='icon')
        | beam.Map(print))
    # [END regex_find_kv]
    if test:
      test(plants_find_kv)


def regex_replace_all(test=None):
  """Run the `beam.Regex.replace_all` example and print each output element.

  The input lines are colon-delimited (one of them uses tabs around the
  colons); the step is given a colon-with-surrounding-whitespace pattern and
  ',' as the replacement.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_replace_all]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    plants_replace_all = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '🍓 : Strawberry : perennial',
            '🥕 : Carrot : biennial',
            '🍆\t:\tEggplant\t:\tperennial',
            '🍅 : Tomato : annual',
            '🥔 : Potato : perennial',
        ])
        | 'To CSV' >> beam.Regex.replace_all(r'\s*:\s*', ',')
        | beam.Map(print))
    # [END regex_replace_all]
    if test:
      test(plants_replace_all)


def regex_replace_first(test=None):
  """Run the `beam.Regex.replace_first` example and print each output element.

  The input lines are comma-delimited; the step is given a
  comma-with-surrounding-whitespace pattern and ': ' as the replacement.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_replace_first]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    plants_replace_first = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '🍓, Strawberry, perennial',
            '🥕, Carrot, biennial',
            '🍆,\tEggplant, perennial',
            '🍅, Tomato, annual',
            '🥔, Potato, perennial',
        ])
        | 'As dictionary' >> beam.Regex.replace_first(r'\s*,\s*', ': ')
        | beam.Map(print))
    # [END regex_replace_first]
    if test:
      test(plants_replace_first)


def regex_split(test=None):
  """Run the `beam.Regex.split` example and print each output element.

  The input lines are colon-delimited; the step is given a
  colon-with-surrounding-whitespace pattern as the separator.

  Args:
    test: Optional callable. When truthy, it is called with the value
      produced by the pipeline expression, inside the `with` block.
  """
  # [START regex_split]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    plants_split = (
        pipeline
        | 'Garden plants' >> beam.Create([
            '🍓 : Strawberry : perennial',
            '🥕 : Carrot : biennial',
            '🍆\t:\tEggplant : perennial',
            '🍅 : Tomato : annual',
            '🥔 : Potato : perennial',
        ])
        | 'Parse plants' >> beam.Regex.split(r'\s*:\s*')
        | beam.Map(print))
    # [END regex_split]
    if test:
      test(plants_split)