github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_xlang_sql.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow that uses the SQL transform.

A Java version supported by Beam must be installed locally to run this
pipeline. Docker must also be available to run it locally.
"""

import argparse
import logging
import re
import typing

import apache_beam as beam
from apache_beam import coders
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.runners.portability import portable_runner
from apache_beam.transforms.sql import SqlTransform

# The input to SqlTransform must be one or more PCollections with a known
# schema. One way to create such a PCollection is to produce a PCollection
# of a NamedTuple type registered with RowCoder.
#
# Here we create and register a simple NamedTuple with a single str-typed
# field named 'word', which we use below.
MyRow = typing.NamedTuple('MyRow', [('word', str)])
coders.registry.register_coder(MyRow, coders.RowCoder)


def run(p, input_file, output_file):
  #pylint: disable=expression-not-assigned
  (
      p
      # Read the lines from a text file.
      | 'Read' >> ReadFromText(input_file)
      # Split each line into individual words.
      | 'Split' >> beam.FlatMap(lambda line: re.split(r'\W+', line))
      # Map each word to an instance of MyRow.
      | 'ToRow' >> beam.Map(MyRow).with_output_types(MyRow)
      # SqlTransform yields a PCollection containing elements with attributes
      # based on the output of the query.
      | 'Sql!!' >> SqlTransform(
          """
          SELECT
            word as key,
            COUNT(*) as `count`
          FROM PCOLLECTION
          GROUP BY word""")
      | 'Format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
      | 'Write' >> WriteToText(output_file))


def main():
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')

  known_args, pipeline_args = parser.parse_known_args()

  pipeline_options = PipelineOptions(pipeline_args)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True

  with beam.Pipeline(options=pipeline_options) as p:
    if isinstance(p.runner, portable_runner.PortableRunner):
      # Preemptively start due to BEAM-6666.
      p.runner.create_job_service(pipeline_options)

    run(p, known_args.input, known_args.output)


# Some more fun queries:
# ------
# SELECT
#   word as key,
#   COUNT(*) as `count`
# FROM PCOLLECTION
# GROUP BY word
# ORDER BY `count` DESC
# LIMIT 100
# ------
# SELECT
#   len as key,
#   COUNT(*) as `count`
# FROM (
#   SELECT
#     LENGTH(word) AS len
#   FROM PCOLLECTION
# )
# GROUP BY len

if __name__ == '__main__':
  main()
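

# A minimal sketch, not part of the original example: the "more fun queries"
# listed above can be dropped straight into SqlTransform. Assuming a
# hypothetical variant of run() named run_top_words, the ORDER BY / LIMIT
# query keeps only the 100 most frequent words; the output columns are still
# named `key` and `count`, so the Format and Write steps are unchanged.
def run_top_words(p, input_file, output_file):
  #pylint: disable=expression-not-assigned
  (
      p
      | 'Read' >> ReadFromText(input_file)
      | 'Split' >> beam.FlatMap(lambda line: re.split(r'\W+', line))
      | 'ToRow' >> beam.Map(MyRow).with_output_types(MyRow)
      # Same shape as run(), but with the top-100 query from the comments.
      | 'TopWordsSql' >> SqlTransform(
          """
          SELECT
            word as key,
            COUNT(*) as `count`
          FROM PCOLLECTION
          GROUP BY word
          ORDER BY `count` DESC
          LIMIT 100""")
      | 'Format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
      | 'Write' >> WriteToText(output_file))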