github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/sql_taxi.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""An example that processes streaming NYC Taxi data with SqlTransform.

This example reads from the PubSub NYC Taxi stream described in
https://github.com/googlecodelabs/cloud-dataflow-nyc-taxi-tycoon, aggregates
the data in 15s windows using SqlTransform, and writes the output to
a user-defined PubSub topic.

A Java version supported by Beam must be installed locally to run this
pipeline. Docker must also be available to run this pipeline locally.
"""

# pytype: skip-file

import json
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.sql import SqlTransform


def run(output_topic, pipeline_args):
  pipeline_options = PipelineOptions(
      pipeline_args, save_main_session=True, streaming=True)

  with beam.Pipeline(options=pipeline_options) as pipeline:
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime',
            timestamp_attribute="ts").with_output_types(bytes)
        | "Parse JSON payload" >> beam.Map(json.loads)
        # Use beam.Row to create a schema-aware PCollection
        | "Create beam Row" >> beam.Map(
            lambda x: beam.Row(
                ride_status=str(x['ride_status']),
                passenger_count=int(x['passenger_count'])))
        # SqlTransform computes its result within each existing window
        | "15s fixed windows" >> beam.WindowInto(beam.window.FixedWindows(15))
        # Aggregate drop offs and pick ups that occur within each 15s window
        | SqlTransform(
            """
             SELECT
               ride_status,
               COUNT(*) AS num_rides,
               SUM(passenger_count) AS total_passengers
             FROM PCOLLECTION
             WHERE NOT ride_status = 'enroute'
             GROUP BY ride_status""")
        # SqlTransform yields python objects with attributes corresponding to
        # the outputs of the query.
        # Collect those attributes, as well as window information, into a dict
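        # (Using beam.DoFn.WindowParam as the lambda's default argument asks
        # Beam to inject each element's window at call time, which is how the
        # window start/end timestamps become available below.)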
        | "Assemble Dictionary" >> beam.Map(
            lambda row,
            window=beam.DoFn.WindowParam: {
                "ride_status": row.ride_status,
                "num_rides": row.num_rides,
                "total_passengers": row.total_passengers,
                "window_start": window.start.to_rfc3339(),
                "window_end": window.end.to_rfc3339()
            })
        | "Convert to JSON" >> beam.Map(json.dumps)
        | "UTF-8 encode" >> beam.Map(lambda s: s.encode("utf-8"))
        | beam.io.WriteToPubSub(topic=output_topic))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  import argparse

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic',
      dest='output_topic',
      required=True,
      help=(
          'Cloud PubSub topic to write to (e.g. '
          'projects/my-project/topics/my-topic), must be created prior to '
          'running the pipeline.'))
  known_args, pipeline_args = parser.parse_known_args()

  run(known_args.output_topic, pipeline_args)
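# A minimal sketch of how this example might be invoked; the project, topic,
# and bucket names below are placeholders, and the Dataflow-specific flags can
# be dropped to run on the DirectRunner instead:
#
#   python sql_taxi.py \
#       --output_topic projects/my-project/topics/my-topic \
#       --runner DataflowRunner \
#       --project my-project \
#       --region us-central1 \
#       --temp_location gs://my-bucket/tmp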