#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A cross-language word-counting workflow."""

# pytype: skip-file

import argparse
import logging
import re
import subprocess

import grpc

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

# avoid possible conflict with job-server embedded expansion service at 8097
EXPANSION_SERVICE_PORT = '8096'
EXPANSION_SERVICE_ADDR = 'localhost:%s' % EXPANSION_SERVICE_PORT


class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""
  def process(self, element):
    """Returns the words found in this element.

    The element is a line of text. A word is a maximal run of word
    characters and apostrophes.

    Args:
      element: the line of text being processed

    Returns:
      A list of the words in the line; empty when the line has no words.
    """
    text_line = element.strip()
    return re.findall(r'[\w\']+', text_line)


def build_pipeline(p, input_file, output_file):
  """Attaches the word-count transforms to the pipeline.

  Args:
    p: the pipeline the transforms are applied to
    input_file: file (or file pattern) passed to ReadFromText
    output_file: output location passed to WriteToText
  """
  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(input_file)

  # The counting step is an external transform that is expanded by the
  # expansion service listening on EXPANSION_SERVICE_ADDR.
  counts = (
      lines
      | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
      | 'count' >> beam.ExternalTransform(
          'beam:transforms:xlang:count', None, EXPANSION_SERVICE_ADDR))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(output_file)


def _wait_for_expansion_service(server, address):
  """Blocks until a gRPC channel to the expansion service is ready.

  Args:
    server: the subprocess.Popen handle of the expansion service process
    address: the host:port the expansion service is expected to listen on

  Raises:
    RuntimeError: if the server process exits before the channel is ready.
  """
  with grpc.insecure_channel(address) as channel:
    ready = grpc.channel_ready_future(channel)
    while True:
      try:
        # Wait in short slices so a dead server is noticed instead of
        # blocking forever on a port that will never open.
        ready.result(timeout=1)
        return
      except grpc.FutureTimeoutError:
        if server.poll() is not None:
          raise RuntimeError(
              'Expansion service exited with code %s before becoming ready.' %
              server.returncode)


def main():
  """Parses arguments, starts the expansion service and runs the pipeline.

  The expansion service jar is launched as a child process on
  EXPANSION_SERVICE_PORT and is killed when the pipeline finishes or fails.
  """
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument(
      '--expansion_service_jar',
      dest='expansion_service_jar',
      required=True,
      help='Jar file for expansion service')

  known_args, pipeline_args = parser.parse_known_args()

  pipeline_options = PipelineOptions(pipeline_args)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Start the server before entering the try block: if Popen itself fails
  # there is no process to clean up, and the finally clause must not touch an
  # unbound name (which would mask the real error).
  server = subprocess.Popen([
      'java',
      '-jar',
      known_args.expansion_service_jar,
      EXPANSION_SERVICE_PORT
  ])

  try:
    _wait_for_expansion_service(server, EXPANSION_SERVICE_ADDR)

    with beam.Pipeline(options=pipeline_options) as p:
      # Preemptively start due to BEAM-6666.
      p.runner.create_job_service(pipeline_options)

      build_pipeline(p, known_args.input, known_args.output)

  finally:
    server.kill()
    # Reap the child so it does not linger as a zombie process.
    server.wait()


if __name__ == '__main__':
  main()