#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Merge phone, email, and mailing address information.

A Dataflow pipeline that merges phone, email, and address information associated
with the same names. Each input "database" is a tab-delimited text file pairing
names with one phone number/email address/mailing address; multiple entries
associated with the same name are allowed. Outputs are a tab-delimited text file
with the merged information and another file containing some simple statistics.
See mergecontacts_test.py for example inputs and outputs.

A demonstration of:
  CoGroupByKey
  Non-linear pipelines (i.e., pipelines with branches)
"""

# pytype: skip-file

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


def run(argv=None, assert_results=None, save_main_session=True):
  """Build and run the contact-merging pipeline.

  Reads the three tab-delimited input databases, groups their entries by
  name, writes one merged tab-delimited line per name to ``--output_tsv``,
  and writes the number of names lacking an email, a phone number, or an
  address to ``--output_stats``.

  Args:
    argv: Command-line arguments. Arguments not declared by this function's
      parser are forwarded to ``PipelineOptions``.
    assert_results: Optional ``(luddites, writers, nomads)`` triple of
      expected counts. When not ``None``, assertions comparing the computed
      counts against these values are added to the pipeline.
    save_main_session: Value assigned to the pipeline's
      ``SetupOptions.save_main_session`` option.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_email',
      required=True,
      help='Email database, with each line formatted as "name<TAB>email".')
  parser.add_argument(
      '--input_phone',
      required=True,
      help='Phonebook, with each line formatted as "name<TAB>phone number".')
  parser.add_argument(
      '--input_snailmail',
      required=True,
      help='Address database, with each line formatted as "name<TAB>address".')
  parser.add_argument(
      '--output_tsv', required=True, help='Tab-delimited output file.')
  parser.add_argument(
      '--output_stats',
      required=True,
      help='Output file for statistics about the input.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:

    # Helper: read a tab-separated key-value mapping from a text file,
    # escape all quotes/backslashes, and convert it to a PCollection of
    # (key, value) pairs.
    def read_kv_textfile(label, textfile):
      return (
          p
          | 'Read: %s' % label >> ReadFromText(textfile)
          # Backslashes are doubled first so that the backslashes introduced
          # by the quote-escaping step below are not escaped a second time.
          | 'Backslash: %s' % label >>
          beam.Map(lambda x: re.sub(r'\\', r'\\\\', x))
          | 'EscapeQuotes: %s' % label >>
          beam.Map(lambda x: re.sub(r'"', r'\"', x))
          # Split on the first run of tabs only; any later tabs stay in the
          # value. maxsplit is passed by keyword because passing it
          # positionally is deprecated as of Python 3.13.
          | 'Split: %s' % label >>
          beam.Map(lambda x: re.split(r'\t+', x, maxsplit=1)))

    # Read input databases.
    email = read_kv_textfile('email', known_args.input_email)
    phone = read_kv_textfile('phone', known_args.input_phone)
    snailmail = read_kv_textfile('snailmail', known_args.input_snailmail)

    # Group together all entries under the same name.
    grouped = (email, phone, snailmail) | 'group_by_name' >> beam.CoGroupByKey()

    # Prepare tab-delimited output; something like this:
    # "name"<TAB>"email_1,email_2"<TAB>"phone"<TAB>"first_snailmail_only"
    def format_as_tsv(name_email_phone_snailmail):
      (name, (email, phone, snailmail)) = name_email_phone_snailmail
      return '\t'.join([
          '"%s"' % name,
          '"%s"' % ','.join(email),
          '"%s"' % ','.join(phone),
          '"%s"' % next(iter(snailmail), '')
      ])

    tsv_lines = grouped | beam.Map(format_as_tsv)

    # Compute some stats about our database of people.
    # Each predicate inspects only the first grouped value, so a name counts
    # as "without" a field when it has no entries for that field or when its
    # first entry is an empty string.
    def without_email(name_email_phone_snailmail):
      (_, (email, _, _)) = name_email_phone_snailmail
      return not next(iter(email), None)

    def without_phones(name_email_phone_snailmail):
      (_, (_, phone, _)) = name_email_phone_snailmail
      return not next(iter(phone), None)

    def without_address(name_email_phone_snailmail):
      (_, (_, _, snailmail)) = name_email_phone_snailmail
      return not next(iter(snailmail), None)

    luddites = grouped | beam.Filter(without_email)  # People without email.
    writers = grouped | beam.Filter(without_phones)  # People without phones.
    nomads = grouped | beam.Filter(without_address)  # People without addresses.

    num_luddites = luddites | 'Luddites' >> beam.combiners.Count.Globally()
    num_writers = writers | 'Writers' >> beam.combiners.Count.Globally()
    num_nomads = nomads | 'Nomads' >> beam.combiners.Count.Globally()

    # Write tab-delimited output.
    # pylint: disable=expression-not-assigned
    tsv_lines | 'WriteTsv' >> WriteToText(known_args.output_tsv)

    # Write the statistics promised by --output_stats, one
    # "label<TAB>count" line per statistic. Every line carries its own label
    # because the order of elements after Flatten is not guaranteed.
    def format_stat(count, label):
      return '%s\t%d' % (label, count)

    stats_lines = (
        (
            num_luddites
            | 'FormatLuddites' >> beam.Map(format_stat, 'luddites'),
            num_writers
            | 'FormatWriters' >> beam.Map(format_stat, 'writers'),
            num_nomads
            | 'FormatNomads' >> beam.Map(format_stat, 'nomads'),
        )
        | 'FlattenStats' >> beam.Flatten())
    stats_lines | 'WriteStats' >> WriteToText(known_args.output_stats)

    # TODO(silviuc): Move the assert_results logic to the unit test.
    if assert_results is not None:
      expected_luddites, expected_writers, expected_nomads = assert_results
      assert_that(
          num_luddites, equal_to([expected_luddites]), label='assert:luddites')
      assert_that(
          num_writers, equal_to([expected_writers]), label='assert:writers')
      assert_that(
          num_nomads, equal_to([expected_nomads]), label='assert:nomads')


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()