github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/mergecontacts_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Test for the mergecontacts example."""
    19  
    20  # pytype: skip-file
    21  
    22  import logging
    23  import unittest
    24  import uuid
    25  
    26  import pytest
    27  
    28  from apache_beam.examples.cookbook import mergecontacts
    29  from apache_beam.testing.test_pipeline import TestPipeline
    30  from apache_beam.testing.test_utils import create_file
    31  from apache_beam.testing.test_utils import read_files_from_pattern
    32  
    33  
    34  class MergeContactsTest(unittest.TestCase):
    35  
    36    CONTACTS_EMAIL = '\n'.join([
    37        'Nathan Nomad\tnathan@example.com',
    38        'Nicky Nomad\tnicky@example.com',
    39        'Noreen Nomad\tnoreen@example.com',
    40        'Noreen Nomad\tnomad@example.com',
    41        'Robert B\trobert@example.com',
    42        'Silviu C\tsilviu@example.com',
    43        'Tom S\ttom@example.com',
    44        'Wally Writer\twally@example.com',
    45        ''
    46    ])
    47  
    48    CONTACTS_PHONE = '\n'.join([
    49        'Larry Luddite\t724-228-3529',
    50        'Lisa Luddite\t304-277-3504',
    51        'Nathan Nomad\t412-466-8968',
    52        'Nicky Nomad\t724-379-5815',
    53        'Noreen Nomad\t412-472-0145',
    54        'Robert B\t814-865-8799',
    55        'Silviu C\t724-537-0671',
    56        'Tom S\t570-368-3420',
    57        'Tom S\t814-793-9655',
    58        ''
    59    ])
    60  
    61    CONTACTS_SNAILMAIL = '\n'.join([
    62        'Larry Luddite\t1949 Westcott St, Detroit, MI 48222',
    63        'Lisa Luddite\t1949 Westcott St, Detroit, MI 48222',
    64        'Robert B\t601 N 34th St, Seattle, WA 98103',
    65        'Silviu C\t1600 Amphitheatre Pkwy, Mountain View, CA 94043',
    66        'Tom S\t6425 Penn Ave Ste 700, Pittsburgh, PA 15206',
    67        'Wally Writer\t101 Ridge Rd, Chalkyitsik, AK 99788',
    68        ''
    69    ])
    70  
    71    EXPECTED_TSV = '\n'.join([
    72        '\t'.join([
    73            '"Larry Luddite"',
    74            '""',
    75            '"724-228-3529"',
    76            '"1949 Westcott St, Detroit, MI 48222"'
    77        ]),
    78        '\t'.join([
    79            '"Lisa Luddite"',
    80            '""',
    81            '"304-277-3504"',
    82            '"1949 Westcott St, Detroit, MI 48222"'
    83        ]),
    84        '\t'.join(
    85            ['"Nathan Nomad"', '"nathan@example.com"', '"412-466-8968"', '""']),
    86        '\t'.join(
    87            ['"Nicky Nomad"', '"nicky@example.com"', '"724-379-5815"', '""']),
    88        '\t'.join([
    89            '"Noreen Nomad"',
    90            '"nomad@example.com,noreen@example.com"',
    91            '"412-472-0145"',
    92            '""'
    93        ]),
    94        '\t'.join([
    95            '"Robert B"',
    96            '"robert@example.com"',
    97            '"814-865-8799"',
    98            '"601 N 34th St, Seattle, WA 98103"'
    99        ]),
   100        '\t'.join([
   101            '"Silviu C"',
   102            '"silviu@example.com"',
   103            '"724-537-0671"',
   104            '"1600 Amphitheatre Pkwy, Mountain View, CA 94043"'
   105        ]),
   106        '\t'.join([
   107            '"Tom S"',
   108            '"tom@example.com"',
   109            '"570-368-3420,814-793-9655"',
   110            '"6425 Penn Ave Ste 700, Pittsburgh, PA 15206"'
   111        ]),
   112        '\t'.join([
   113            '"Wally Writer"',
   114            '"wally@example.com"',
   115            '""',
   116            '"101 Ridge Rd, Chalkyitsik, AK 99788"'
   117        ]),
   118        ''
   119    ])
   120  
   121    EXPECTED_STATS = '\n'.join(['2 luddites', '1 writers', '3 nomads', ''])
   122  
   123    def normalize_tsv_results(self, tsv_data):
   124      """Sort .tsv file data so we can compare it with expected output."""
   125      lines_in = tsv_data.strip().split('\n')
   126      lines_out = []
   127      for line in lines_in:
   128        name, email, phone, snailmail = line.split('\t')
   129        lines_out.append(
   130            '\t'.join([
   131                name,
   132                '"%s"' % ','.join(sorted(email.strip('"').split(','))),
   133                '"%s"' % ','.join(sorted(phone.strip('"').split(','))),
   134                snailmail
   135            ]))
   136      return '\n'.join(sorted(lines_out)) + '\n'
   137  
   138    @pytest.mark.examples_postcommit
   139    @pytest.mark.sickbay_flink
   140    def test_mergecontacts(self):
   141      test_pipeline = TestPipeline(is_integration_test=True)
   142  
   143      # Setup the files with expected content.
   144      temp_location = test_pipeline.get_option('temp_location')
   145      input_folder = '/'.join([temp_location, str(uuid.uuid4())])
   146      path_email = create_file(
   147          '/'.join([input_folder, 'path_email.txt']), self.CONTACTS_EMAIL)
   148      path_phone = create_file(
   149          '/'.join([input_folder, 'path_phone.txt']), self.CONTACTS_PHONE)
   150      path_snailmail = create_file(
   151          '/'.join([input_folder, 'path_snailmail.txt']), self.CONTACTS_SNAILMAIL)
   152  
   153      result_prefix = '/'.join([temp_location, str(uuid.uuid4()), 'result'])
   154      extra_opts = {
   155          'input_email': path_email,
   156          'input_phone': path_phone,
   157          'input_snailmail': path_snailmail,
   158          'output_tsv': '%s.tsv' % result_prefix,
   159          'output_stats': '%s.stats' % result_prefix
   160      }
   161  
   162      pipeline_opts = test_pipeline.get_full_options_as_args(**extra_opts)
   163      # Prevent ambiguous option error between output in
   164      # args and expected output_tsv and output_stats
   165      output_arg = [i for i in pipeline_opts if i.startswith('--output=')]
   166      if output_arg:
   167        pipeline_opts.remove(output_arg[0])
   168      mergecontacts.run(
   169          pipeline_opts, assert_results=(2, 1, 3), save_main_session=False)
   170  
   171      contents = read_files_from_pattern('%s*' % result_prefix)
   172      self.assertEqual(self.EXPECTED_TSV, self.normalize_tsv_results(contents))
   173  
   174  
   175  if __name__ == '__main__':
   176    logging.getLogger().setLevel(logging.INFO)
   177    unittest.main()