github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/dataframe/wordcount.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow using the DataFrame API."""

# pytype: skip-file

import argparse
import logging

import apache_beam as beam
from apache_beam.dataframe.convert import to_dataframe
from apache_beam.dataframe.convert import to_pcollection
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions


def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Import this here to avoid pickling the main session.
  import re

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:

    # [START DataFrame_wordcount]

    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    words = (
        lines
        | 'Split' >> beam.FlatMap(
            lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
        # Map to Row objects to generate a schema suitable for conversion
        # to a dataframe.
        | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

    df = to_dataframe(words)
    df['count'] = 1
    counted = df.groupby('word').sum()
    counted.to_csv(known_args.output)

    # Deferred DataFrames can also be converted back to schema'd PCollections
    counted_pc = to_pcollection(counted, include_indexes=True)

    # [END DataFrame_wordcount]

    # Print out every word that occurred >50 times
    _ = (
        counted_pc
        | beam.Filter(lambda row: row.count > 50)
        | beam.Map(lambda row: f'{row.word}: {row.count}')
        | beam.Map(print))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
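
# A minimal sketch of invoking this example locally with Beam's default
# DirectRunner; the output path below is an arbitrary choice for
# illustration, not one mandated by the example:
#
#   python wordcount.py --output /tmp/dataframe_wordcount
#
# Because parse_known_args forwards unrecognized flags in pipeline_args,
# the same invocation can target other runners by adding the usual
# options, e.g. --runner DataflowRunner with --project, --region, and
# --temp_location.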