github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_xlang.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A cross-language word-counting workflow."""
    19  
    20  # pytype: skip-file
    21  
    22  import argparse
    23  import logging
    24  import re
    25  import subprocess
    26  
    27  import grpc
    28  
    29  import apache_beam as beam
    30  from apache_beam.io import ReadFromText
    31  from apache_beam.io import WriteToText
    32  from apache_beam.options.pipeline_options import PipelineOptions
    33  from apache_beam.options.pipeline_options import SetupOptions
    34  
    35  # avoid possible conflict with job-server embedded expansion service at 8097
    36  EXPANSION_SERVICE_PORT = '8096'
    37  EXPANSION_SERVICE_ADDR = 'localhost:%s' % EXPANSION_SERVICE_PORT
    38  
    39  
    40  class WordExtractingDoFn(beam.DoFn):
    41    """Parse each line of input text into words."""
    42    def process(self, element):
    43      """Returns an iterator over the words of this element.
    44  
    45      The element is a line of text.  If the line is blank, note that, too.
    46  
    47      Args:
    48        element: the element being processed
    49  
    50      Returns:
    51        The processed element.
    52      """
    53      text_line = element.strip()
    54      return re.findall(r'[\w\']+', text_line)
    55  
    56  
    57  def build_pipeline(p, input_file, output_file):
    58    # Read the text file[pattern] into a PCollection.
    59    lines = p | 'read' >> ReadFromText(input_file)
    60  
    61    counts = (
    62        lines
    63        | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
    64        | 'count' >> beam.ExternalTransform(
    65            'beam:transforms:xlang:count', None, EXPANSION_SERVICE_ADDR))
    66  
    67    # Format the counts into a PCollection of strings.
    68    def format_result(word_count):
    69      (word, count) = word_count
    70      return '%s: %d' % (word, count)
    71  
    72    output = counts | 'format' >> beam.Map(format_result)
    73  
    74    # Write the output using a "Write" transform that has side effects.
    75    # pylint: disable=expression-not-assigned
    76    output | 'write' >> WriteToText(output_file)
    77  
    78  
    79  def main():
    80    logging.getLogger().setLevel(logging.INFO)
    81  
    82    parser = argparse.ArgumentParser()
    83    parser.add_argument(
    84        '--input',
    85        dest='input',
    86        default='gs://dataflow-samples/shakespeare/kinglear.txt',
    87        help='Input file to process.')
    88    parser.add_argument(
    89        '--output',
    90        dest='output',
    91        required=True,
    92        help='Output file to write results to.')
    93    parser.add_argument(
    94        '--expansion_service_jar',
    95        dest='expansion_service_jar',
    96        required=True,
    97        help='Jar file for expansion service')
    98  
    99    known_args, pipeline_args = parser.parse_known_args()
   100  
   101    pipeline_options = PipelineOptions(pipeline_args)
   102  
   103    # We use the save_main_session option because one or more DoFn's in this
   104    # workflow rely on global context (e.g., a module imported at module level).
   105    pipeline_options.view_as(SetupOptions).save_main_session = True
   106  
   107    try:
   108      server = subprocess.Popen([
   109          'java',
   110          '-jar',
   111          known_args.expansion_service_jar,
   112          EXPANSION_SERVICE_PORT
   113      ])
   114  
   115      with grpc.insecure_channel(EXPANSION_SERVICE_ADDR) as channel:
   116        grpc.channel_ready_future(channel).result()
   117  
   118      with beam.Pipeline(options=pipeline_options) as p:
   119        # Preemptively start due to BEAM-6666.
   120        p.runner.create_job_service(pipeline_options)
   121  
   122        build_pipeline(p, known_args.input, known_args.output)
   123  
   124    finally:
   125      server.kill()
   126  
   127  
# Run the example only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
  main()