github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_xlang_sql.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow that uses the SQL transform.

A Java version supported by Beam must be installed locally to run this
pipeline. Docker must also be available to run it locally.
"""

import argparse
import logging
import re
import typing

import apache_beam as beam
from apache_beam import coders
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.runners.portability import portable_runner
from apache_beam.transforms.sql import SqlTransform

# The input to SqlTransform must be one or more PCollections with a known
# schema. One way to create such a PCollection is to produce a PCollection
# of a NamedTuple type registered with RowCoder.
#
# Here we create and register a simple NamedTuple with a single str-typed
# field named 'word', which we use below.
MyRow = typing.NamedTuple('MyRow', [('word', str)])
coders.registry.register_coder(MyRow, coders.RowCoder)


def run(p, input_file, output_file):
  #pylint: disable=expression-not-assigned
  (
      p
      # Read the lines from a text file.
      | 'Read' >> ReadFromText(input_file)
      # Split each line into individual words.
      | 'Split' >> beam.FlatMap(lambda line: re.split(r'\W+', line))
      # Map each word to an instance of MyRow.
      | 'ToRow' >> beam.Map(MyRow).with_output_types(MyRow)
      # SqlTransform yields a PCollection containing elements with attributes
      # based on the output of the query.
      | 'Sql!!' >> SqlTransform(
          """
          SELECT
            word as key,
            COUNT(*) as `count`
          FROM PCOLLECTION
          GROUP BY word""")
      | 'Format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
      | 'Write' >> WriteToText(output_file))


def main():
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')

  known_args, pipeline_args = parser.parse_known_args()

  pipeline_options = PipelineOptions(pipeline_args)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True

  with beam.Pipeline(options=pipeline_options) as p:
    if isinstance(p.runner, portable_runner.PortableRunner):
      # Preemptively start due to BEAM-6666.
      p.runner.create_job_service(pipeline_options)

    run(p, known_args.input, known_args.output)


# Some more fun queries:
# ------
# SELECT
#   word as key,
#   COUNT(*) as `count`
# FROM PCOLLECTION
# GROUP BY word
# ORDER BY `count` DESC
# LIMIT 100
# ------
# SELECT
#   len as key,
#   COUNT(*) as `count`
# FROM (
#   SELECT
#     LENGTH(word) AS len
#   FROM PCOLLECTION
# )
# GROUP BY len

if __name__ == '__main__':
  main()
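

# A minimal sketch, not part of the original example: the "more fun queries"
# listed above can be dropped straight into SqlTransform. Assuming a
# hypothetical variant of run() named run_top_words, the ORDER BY / LIMIT
# query keeps only the 100 most frequent words; the output columns are still
# named `key` and `count`, so the Format and Write steps are unchanged.
def run_top_words(p, input_file, output_file):
  #pylint: disable=expression-not-assigned
  (
      p
      | 'Read' >> ReadFromText(input_file)
      | 'Split' >> beam.FlatMap(lambda line: re.split(r'\W+', line))
      | 'ToRow' >> beam.Map(MyRow).with_output_types(MyRow)
      # Same shape as run(), but with the top-100 query from the comments.
      | 'TopWordsSql' >> SqlTransform(
          """
          SELECT
            word as key,
            COUNT(*) as `count`
          FROM PCOLLECTION
          GROUP BY word
          ORDER BY `count` DESC
          LIMIT 100""")
      | 'Format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
      | 'Write' >> WriteToText(output_file))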