#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Test for the mergecontacts example."""

# pytype: skip-file

import logging
import unittest
import uuid

import pytest

from apache_beam.examples.cookbook import mergecontacts
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.test_utils import create_file
from apache_beam.testing.test_utils import read_files_from_pattern


class MergeContactsTest(unittest.TestCase):
  """Integration test that runs the mergecontacts example end to end.

  The class-level constants hold three tab-separated input files (name plus
  email, phone, or postal address) and the merged TSV expected from them.
  """

  # Input: "name<TAB>email" records, one per line, newline terminated.
  CONTACTS_EMAIL = '\n'.join([
      'Nathan Nomad\tnathan@example.com',
      'Nicky Nomad\tnicky@example.com',
      'Noreen Nomad\tnoreen@example.com',
      'Noreen Nomad\tnomad@example.com',
      'Robert B\trobert@example.com',
      'Silviu C\tsilviu@example.com',
      'Tom S\ttom@example.com',
      'Wally Writer\twally@example.com',
      ''
  ])

  # Input: "name<TAB>phone" records, one per line, newline terminated.
  CONTACTS_PHONE = '\n'.join([
      'Larry Luddite\t724-228-3529',
      'Lisa Luddite\t304-277-3504',
      'Nathan Nomad\t412-466-8968',
      'Nicky Nomad\t724-379-5815',
      'Noreen Nomad\t412-472-0145',
      'Robert B\t814-865-8799',
      'Silviu C\t724-537-0671',
      'Tom S\t570-368-3420',
      'Tom S\t814-793-9655',
      ''
  ])

  # Input: "name<TAB>postal address" records, one per line.
  CONTACTS_SNAILMAIL = '\n'.join([
      'Larry Luddite\t1949 Westcott St, Detroit, MI 48222',
      'Lisa Luddite\t1949 Westcott St, Detroit, MI 48222',
      'Robert B\t601 N 34th St, Seattle, WA 98103',
      'Silviu C\t1600 Amphitheatre Pkwy, Mountain View, CA 94043',
      'Tom S\t6425 Penn Ave Ste 700, Pittsburgh, PA 15206',
      'Wally Writer\t101 Ridge Rd, Chalkyitsik, AK 99788',
      ''
  ])

  # Expected merged output in normalized form: rows sorted, and the values
  # inside each multi-valued email/phone field sorted as well. Every field is
  # double-quoted; a missing value is an empty pair of quotes.
  EXPECTED_TSV = '\n'.join([
      '\t'.join([
          '"Larry Luddite"',
          '""',
          '"724-228-3529"',
          '"1949 Westcott St, Detroit, MI 48222"'
      ]),
      '\t'.join([
          '"Lisa Luddite"',
          '""',
          '"304-277-3504"',
          '"1949 Westcott St, Detroit, MI 48222"'
      ]),
      '\t'.join(
          ['"Nathan Nomad"', '"nathan@example.com"', '"412-466-8968"', '""']),
      '\t'.join(
          ['"Nicky Nomad"', '"nicky@example.com"', '"724-379-5815"', '""']),
      '\t'.join([
          '"Noreen Nomad"',
          '"nomad@example.com,noreen@example.com"',
          '"412-472-0145"',
          '""'
      ]),
      '\t'.join([
          '"Robert B"',
          '"robert@example.com"',
          '"814-865-8799"',
          '"601 N 34th St, Seattle, WA 98103"'
      ]),
      '\t'.join([
          '"Silviu C"',
          '"silviu@example.com"',
          '"724-537-0671"',
          '"1600 Amphitheatre Pkwy, Mountain View, CA 94043"'
      ]),
      '\t'.join([
          '"Tom S"',
          '"tom@example.com"',
          '"570-368-3420,814-793-9655"',
          '"6425 Penn Ave Ste 700, Pittsburgh, PA 15206"'
      ]),
      '\t'.join([
          '"Wally Writer"',
          '"wally@example.com"',
          '""',
          '"101 Ridge Rd, Chalkyitsik, AK 99788"'
      ]),
      ''
  ])

  # Not referenced by the test below; the same counts are passed to
  # mergecontacts.run() through assert_results.
  EXPECTED_STATS = '\n'.join(['2 luddites', '1 writers', '3 nomads', ''])

  @staticmethod
  def _sort_quoted_values(field):
    """Returns a quoted comma-separated field with its values sorted."""
    return '"%s"' % ','.join(sorted(field.strip('"').split(',')))

  def normalize_tsv_results(self, tsv_data):
    """Sort .tsv file data so we can compare it with expected output.

    Rows are sorted, and the comma-separated values inside the email and
    phone columns are sorted too, so the result does not depend on the order
    in which the pipeline emitted them.

    Args:
      tsv_data: Text with one record per line, each record made of four
        tab-separated fields (name, email, phone, snailmail).

    Returns:
      The normalized records joined by newlines, with a trailing newline.

    Raises:
      ValueError: If a non-empty line does not have exactly four fields.
    """
    lines_out = []
    for line in tsv_data.strip().split('\n'):
      # Skip empty lines instead of failing the four-way unpack below, so
      # that empty output surfaces as an assertEqual mismatch.
      # NOTE(review): blank lines presumably appear when the output has an
      # empty shard -- confirm against read_files_from_pattern.
      if not line:
        continue
      name, email, phone, snailmail = line.split('\t')
      lines_out.append(
          '\t'.join([
              name,
              self._sort_quoted_values(email),
              self._sort_quoted_values(phone),
              snailmail
          ]))
    return '\n'.join(sorted(lines_out)) + '\n'

  @pytest.mark.examples_postcommit
  @pytest.mark.sickbay_flink
  def test_mergecontacts(self):
    """Runs mergecontacts on the fixture files and checks the merged TSV."""
    test_pipeline = TestPipeline(is_integration_test=True)

    # Setup the files with expected content. Each run gets its own
    # uuid-named folder under temp_location.
    temp_location = test_pipeline.get_option('temp_location')
    input_folder = '/'.join([temp_location, str(uuid.uuid4())])
    path_email = create_file(
        '/'.join([input_folder, 'path_email.txt']), self.CONTACTS_EMAIL)
    path_phone = create_file(
        '/'.join([input_folder, 'path_phone.txt']), self.CONTACTS_PHONE)
    path_snailmail = create_file(
        '/'.join([input_folder, 'path_snailmail.txt']),
        self.CONTACTS_SNAILMAIL)

    result_prefix = '/'.join([temp_location, str(uuid.uuid4()), 'result'])
    extra_opts = {
        'input_email': path_email,
        'input_phone': path_phone,
        'input_snailmail': path_snailmail,
        'output_tsv': '%s.tsv' % result_prefix,
        'output_stats': '%s.stats' % result_prefix
    }

    pipeline_opts = test_pipeline.get_full_options_as_args(**extra_opts)
    # Prevent ambiguous option error between output in
    # args and expected output_tsv and output_stats. Every '--output='
    # entry is dropped, not just the first, so a duplicate cannot slip
    # through.
    pipeline_opts = [
        opt for opt in pipeline_opts if not opt.startswith('--output=')
    ]
    # assert_results carries the counts listed in EXPECTED_STATS
    # (presumably luddites, writers, nomads -- confirm in mergecontacts.run).
    mergecontacts.run(
        pipeline_opts, assert_results=(2, 1, 3), save_main_session=False)

    contents = read_files_from_pattern('%s*' % result_prefix)
    self.assertEqual(self.EXPECTED_TSV, self.normalize_tsv_results(contents))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()