github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_minimal.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A minimalist word-counting workflow that counts words in Shakespeare.

This is the first in a series of successively more detailed 'word count'
examples.

Next, see the wordcount pipeline, then the wordcount_debugging pipeline, for
more detailed examples that introduce additional concepts.

Concepts:

1. Reading data from text files
2. Specifying 'inline' transforms
3. Counting a PCollection
4. Writing data to Cloud Storage as text files

To execute this pipeline locally, first edit the code to specify the output
location. The output location can be a local file path or an output prefix
on GCS. (Only update the output location marked with the first CHANGE comment.)

To execute this pipeline remotely, first edit the code to set your project ID,
runner type, the staging location, the temp location, and the output location.
The specified GCS bucket(s) must already exist. (Update all the places marked
with a CHANGE comment.)

Then, run the pipeline as described in the README. It will be deployed and run
using the Google Cloud Dataflow Service. No args are required to run the
pipeline. You can see the results in your output bucket in the GCS browser.
"""

# pytype: skip-file

# beam-playground:
#   name: WordCountMinimal
#   description: An example that counts words in Shakespeare's works.
#   multifile: false
#   pipeline_options: --output output.txt
#   context_line: 74
#   categories:
#     - IO
#     - Core Transforms
#     - Flatten
#     - Options
#     - Combiners
#     - Quickstart
#   complexity: MEDIUM
#   tags:
#     - count
#     - strings
#     - hellobeam

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


def main(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      # CHANGE 1/6: (OPTIONAL) The Google Cloud Storage path is required
      # for outputting the results.
      default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
      help='Output file to write results to.')

  # If you use DataflowRunner, the options below can be passed:
  #   CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
  #   run your pipeline on the Google Cloud Dataflow Service.
  #   '--runner=DirectRunner',
  #   CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
  #   run your pipeline on the Google Cloud Dataflow Service.
  #   '--project=SET_YOUR_PROJECT_ID_HERE',
  #   CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
  #   is required in order to run your pipeline on the Google Cloud
  #   Dataflow Service.
  #   '--region=SET_REGION_HERE',
  #   CHANGE 5/6: Your Google Cloud Storage path is required for staging local
  #   files.
  #   '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
  #   CHANGE 6/6: Your Google Cloud Storage path is required for temporary
  #   files.
  #   '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
  #   '--job_name=your-wordcount-job',
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection.
    lines = p | ReadFromText(known_args.input)

    # Count the occurrences of each word.
    counts = (
        lines
        | 'Split' >> (
            beam.FlatMap(
                lambda x: re.findall(r'[A-Za-z\']+', x)).with_output_types(str))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %s' % (word, count)

    output = counts | 'Format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | WriteToText(known_args.output)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  main()
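
# A minimal sketch of how this example is commonly invoked locally with the
# default DirectRunner; the output prefix 'counts' is an assumption for
# illustration, not part of this file (see the README for the authoritative
# run instructions):
#
#   python -m apache_beam.examples.wordcount_minimal \
#       --input gs://dataflow-samples/shakespeare/kinglear.txt \
#       --output counts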