github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/sql_taxi.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """An example that processes streaming NYC Taxi data with SqlTransform.
    19  
    20  This example reads from the PubSub NYC Taxi stream described in
    21  https://github.com/googlecodelabs/cloud-dataflow-nyc-taxi-tycoon, aggregates
    22  the data in 15s windows using SqlTransform, and writes the output to
    23  a user-defined PubSub topic.
    24  
    25  A Java version supported by Beam must be installed locally to run this pipeline.
    26  Additionally, Docker must also be available to run this pipeline locally.
    27  """
    28  
    29  # pytype: skip-file
    30  
    31  import json
    32  import logging
    33  
    34  import apache_beam as beam
    35  from apache_beam.options.pipeline_options import PipelineOptions
    36  from apache_beam.transforms.sql import SqlTransform
    37  
    38  
    39  def run(output_topic, pipeline_args):
    40    pipeline_options = PipelineOptions(
    41        pipeline_args, save_main_session=True, streaming=True)
    42  
    43    with beam.Pipeline(options=pipeline_options) as pipeline:
    44      _ = (
    45          pipeline
    46          | beam.io.ReadFromPubSub(
    47              topic='projects/pubsub-public-data/topics/taxirides-realtime',
    48              timestamp_attribute="ts").with_output_types(bytes)
    49          | "Parse JSON payload" >> beam.Map(json.loads)
    50          # Use beam.Row to create a schema-aware PCollection
    51          | "Create beam Row" >> beam.Map(
    52              lambda x: beam.Row(
    53                  ride_status=str(x['ride_status']),
    54                  passenger_count=int(x['passenger_count'])))
    55          # SqlTransform will computes result within an existing window
    56          | "15s fixed windows" >> beam.WindowInto(beam.window.FixedWindows(15))
    57          # Aggregate drop offs and pick ups that occur within each 15s window
    58          | SqlTransform(
    59              """
    60               SELECT
    61                 ride_status,
    62                 COUNT(*) AS num_rides,
    63                 SUM(passenger_count) AS total_passengers
    64               FROM PCOLLECTION
    65               WHERE NOT ride_status = 'enroute'
    66               GROUP BY ride_status""")
    67          # SqlTransform yields python objects with attributes corresponding to
    68          # the outputs of the query.
    69          # Collect those attributes, as well as window information, into a dict
    70          | "Assemble Dictionary" >> beam.Map(
    71              lambda row,
    72              window=beam.DoFn.WindowParam: {
    73                  "ride_status": row.ride_status,
    74                  "num_rides": row.num_rides,
    75                  "total_passengers": row.total_passengers,
    76                  "window_start": window.start.to_rfc3339(),
    77                  "window_end": window.end.to_rfc3339()
    78              })
    79          | "Convert to JSON" >> beam.Map(json.dumps)
    80          | "UTF-8 encode" >> beam.Map(lambda s: s.encode("utf-8"))
    81          | beam.io.WriteToPubSub(topic=output_topic))
    82  
    83  
    84  if __name__ == '__main__':
    85    logging.getLogger().setLevel(logging.INFO)
    86    import argparse
    87  
    88    parser = argparse.ArgumentParser()
    89    parser.add_argument(
    90        '--output_topic',
    91        dest='output_topic',
    92        required=True,
    93        help=(
    94            'Cloud PubSub topic to write to (e.g. '
    95            'projects/my-project/topics/my-topic), must be created prior to '
    96            'running the pipeline.'))
    97    known_args, pipeline_args = parser.parse_known_args()
    98  
    99    run(known_args.output_topic, pipeline_args)