github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/mergecontacts.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """Merge phone, email, and mailing address information.
    19  
    20  A Dataflow pipeline that merges phone, email, and address information associated
    21  with the same names. Each input "database" is a tab-delimited text file pairing
    22  names with one phone number/email address/mailing address; multiple entries
    23  associated with the same name are allowed. Outputs are a tab-delimited text file
    24  with the merged information and another file containing some simple statistics.
    25  See mergecontacts_test.py for example inputs and outputs.
    26  
    27  A demonstration of:
    28    CoGroupByKey
    29    Non-linear pipelines (i.e., pipelines with branches)
    30  """
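
# Example invocation (the input/output paths here are placeholders):
#
#   python -m apache_beam.examples.cookbook.mergecontacts \
#       --input_email=email.txt --input_phone=phone.txt \
#       --input_snailmail=snailmail.txt \
#       --output_tsv=contacts.tsv --output_stats=stats.txt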

# pytype: skip-file

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

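
# A minimal sketch of the CoGroupByKey behavior that the pipeline in run()
# relies on. It is illustrative only: run() never calls it, and the names and
# values below are made up.
def _cogroupbykey_sketch():
  with beam.Pipeline() as p:
    emails = p | 'CreateEmails' >> beam.Create([('alice', 'a@example.com')])
    phones = p | 'CreatePhones' >> beam.Create(
        [('alice', '555-0100'), ('bob', '555-0199')])
    grouped = (emails, phones) | beam.CoGroupByKey()
    # Conceptually, 'grouped' contains
    #   ('alice', (['a@example.com'], ['555-0100']))
    #   ('bob', ([], ['555-0199']))
    # i.e. one iterable of values per input PCollection, per key.
    _ = grouped | 'PrintSketch' >> beam.Map(print)

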
def run(argv=None, assert_results=None, save_main_session=True):

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_email',
      required=True,
      help='Email database, with each line formatted as "name<TAB>email".')
  parser.add_argument(
      '--input_phone',
      required=True,
      help='Phonebook, with each line formatted as "name<TAB>phone number".')
  parser.add_argument(
      '--input_snailmail',
      required=True,
      help='Address database, with each line formatted as "name<TAB>address".')
  parser.add_argument(
      '--output_tsv', required=True, help='Tab-delimited output file.')
  parser.add_argument(
      '--output_stats',
      required=True,
      help='Output file for statistics about the input.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:

    # Helper: read a tab-separated key-value mapping from a text file,
    # escape all quotes/backslashes, and convert it to a PCollection of
    # (key, value) pairs.
    def read_kv_textfile(label, textfile):
      return (
          p
          | 'Read: %s' % label >> ReadFromText(textfile)
          | 'Backslash: %s' % label >>
          beam.Map(lambda x: re.sub(r'\\', r'\\\\', x))
          | 'EscapeQuotes: %s' % label >>
          beam.Map(lambda x: re.sub(r'"', r'\"', x))
          | 'Split: %s' % label >> beam.Map(lambda x: re.split(r'\t+', x, 1)))
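    # For example, an input line 'john\tj@example.com' (a made-up record)
    # becomes the two-element pair ['john', 'j@example.com']; because the
    # split uses a maxsplit of 1, any tabs inside the value itself are kept.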

    # Read input databases.
    email = read_kv_textfile('email', known_args.input_email)
    phone = read_kv_textfile('phone', known_args.input_phone)
    snailmail = read_kv_textfile('snailmail', known_args.input_snailmail)

    # Group together all entries under the same name.
    grouped = (email, phone, snailmail) | 'group_by_name' >> beam.CoGroupByKey()
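    # Each element of 'grouped' has the form
    #   (name, (emails, phones, snailmails))
    # where each of the three components is an iterable of the values read for
    # that name from the corresponding input (possibly empty).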

    # Prepare tab-delimited output; something like this:
    # "name"<TAB>"email_1,email_2"<TAB>"phone"<TAB>"first_snailmail_only"
    def format_as_tsv(name_email_phone_snailmail):
      (name, (email, phone, snailmail)) = name_email_phone_snailmail
      return '\t'.join([
          '"%s"' % name,
          '"%s"' % ','.join(email),
          '"%s"' % ','.join(phone),
          '"%s"' % next(iter(snailmail), '')
      ])
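    # For example (hypothetical values),
    #   ('john', (['j@example.com'], [], ['1 Main St']))
    # is rendered as
    #   "john"<TAB>"j@example.com"<TAB>""<TAB>"1 Main St"
    # (only the first of multiple addresses is kept).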

    tsv_lines = grouped | beam.Map(format_as_tsv)

    # Compute some stats about our database of people.
    def without_email(name_email_phone_snailmail):
      (_, (email, _, _)) = name_email_phone_snailmail
      return not next(iter(email), None)

    def without_phones(name_email_phone_snailmail):
      (_, (_, phone, _)) = name_email_phone_snailmail
      return not next(iter(phone), None)

    def without_address(name_email_phone_snailmail):
      (_, (_, _, snailmail)) = name_email_phone_snailmail
      return not next(iter(snailmail), None)
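    # next(iter(values), None) peeks at the first value for a name (or returns
    # None if there are none), so each predicate above is True when the
    # corresponding input had no non-empty entry for that name.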

    luddites = grouped | beam.Filter(without_email)  # People without email.
    writers = grouped | beam.Filter(without_phones)  # People without phones.
    nomads = grouped | beam.Filter(without_address)  # People without addresses.

    num_luddites = luddites | 'Luddites' >> beam.combiners.Count.Globally()
    num_writers = writers | 'Writers' >> beam.combiners.Count.Globally()
    num_nomads = nomads | 'Nomads' >> beam.combiners.Count.Globally()
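    # Each Count.Globally() above yields a PCollection holding a single
    # integer: the number of elements in its input.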

    # Write tab-delimited output.
    # pylint: disable=expression-not-assigned
    tsv_lines | 'WriteTsv' >> WriteToText(known_args.output_tsv)
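
    # Write the statistics as well. The original example declares
    # --output_stats but never writes to it; the block below is a sketch that
    # fills that gap, and the 'label<TAB>count' line format is an assumption,
    # not part of the original pipeline.
    stats_lines = (
        (
            num_luddites
            | 'FormatLuddites' >> beam.Map(lambda n: 'luddites\t%d' % n),
            num_writers
            | 'FormatWriters' >> beam.Map(lambda n: 'writers\t%d' % n),
            num_nomads
            | 'FormatNomads' >> beam.Map(lambda n: 'nomads\t%d' % n),
        )
        | 'FlattenStats' >> beam.Flatten())
    stats_lines | 'WriteStats' >> WriteToText(known_args.output_stats)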

    # TODO(silviuc): Move the assert_results logic to the unit test.
    if assert_results is not None:
      expected_luddites, expected_writers, expected_nomads = assert_results
      assert_that(
          num_luddites, equal_to([expected_luddites]), label='assert:luddites')
      assert_that(
          num_writers, equal_to([expected_writers]), label='assert:writers')
      assert_that(
          num_nomads, equal_to([expected_nomads]), label='assert:nomads')


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()