github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/kafkataxi/kafka_taxi.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""An example that writes to and reads from Kafka.

This example reads from the PubSub NYC Taxi stream described in
https://github.com/googlecodelabs/cloud-dataflow-nyc-taxi-tycoon, writes to a
given Kafka topic, and reads back from the same Kafka topic.
"""

# pytype: skip-file

import logging
import sys
import typing

import apache_beam as beam
from apache_beam.io.kafka import ReadFromKafka
from apache_beam.io.kafka import WriteToKafka
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions


def run(
    bootstrap_servers,
    topic,
    with_metadata,
    bq_dataset,
    bq_table_name,
    project,
    pipeline_options):
  # Example argument values (illustrative placeholders):
  # bootstrap_servers = '123.45.67.89:9092'
  # topic = 'kafka_taxirides_realtime'
  # pipeline_args = ['--project', 'my-project',
  #                  '--runner', 'DataflowRunner',
  #                  '--temp_location', 'my-temp-location',
  #                  '--region', 'my-region',
  #                  '--num_workers', 'my-num-workers']

  window_size = 15  # size of the window in seconds.

  def log_ride(ride):
    if 'timestamp' in ride:
      logging.info(
          'Found ride at latitude %r and longitude %r with %r '
          'passengers at timestamp %r',
          ride['latitude'],
          ride['longitude'],
          ride['passenger_count'],
          ride['timestamp'])
    else:
      logging.info(
          'Found ride at latitude %r and longitude %r with %r '
          'passengers',
          ride['latitude'],
          ride['longitude'],
          ride['passenger_count'])

  def convert_kafka_record_to_dictionary(record):
    # The records have a 'value' attribute when --with_metadata is given.
    if hasattr(record, 'value'):
      ride_bytes = record.value
    elif isinstance(record, tuple):
      ride_bytes = record[1]
    else:
      raise RuntimeError('unknown record type: %s' % type(record))
    # Convert the bytes record read from Kafka to a dictionary.
    import ast
    ride = ast.literal_eval(ride_bytes.decode('UTF-8'))
    output = {
        key: ride[key]
        for key in ['latitude', 'longitude', 'passenger_count']
    }
    if hasattr(record, 'timestamp'):
      # The timestamp is read from Kafka metadata.
      output['timestamp'] = record.timestamp
    return output

  with beam.Pipeline(options=pipeline_options) as pipeline:
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime').
        with_output_types(bytes)
        | beam.Map(lambda x: (b'', x)).with_output_types(
            typing.Tuple[bytes, bytes])  # Kafka write transforms expect KVs.
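        # WriteToKafka and ReadFromKafka are cross-language transforms
        # (the Python SDK expands them to the Java KafkaIO implementation),
        # and the write side expects key/value pairs; the empty byte-string
        # key produced above satisfies that contract.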
        | beam.WindowInto(beam.window.FixedWindows(window_size))
        | WriteToKafka(
            producer_config={'bootstrap.servers': bootstrap_servers},
            topic=topic))

    ride_col = (
        pipeline
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrap_servers},
            topics=[topic],
            with_metadata=with_metadata)
        | beam.Map(convert_kafka_record_to_dictionary))

    if bq_dataset:
      schema = 'latitude:STRING,longitude:STRING,passenger_count:INTEGER'
      if with_metadata:
        schema += ',timestamp:STRING'
      _ = (
          ride_col
          | beam.io.WriteToBigQuery(bq_table_name, bq_dataset, project, schema))
    else:
      # log_ride is applied purely for its logging side effect.
      _ = ride_col | beam.Map(log_ride)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  import argparse

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--bootstrap_servers',
      dest='bootstrap_servers',
      required=True,
      help='Bootstrap servers for the Kafka cluster. Should be accessible by '
      'the runner.')
  parser.add_argument(
      '--topic',
      dest='topic',
      default='kafka_taxirides_realtime',
      help='Kafka topic to write to and read from.')
  parser.add_argument(
      '--with_metadata',
      default=False,
      action='store_true',
      help='If set, also reads metadata from the Kafka broker.')
  parser.add_argument(
      '--bq_dataset',
      type=str,
      default='',
      help='BigQuery dataset to write tables to. '
      'If set, export data to a BigQuery table instead of just logging. '
      'Must already exist.')
  parser.add_argument(
      '--bq_table_name',
      default='xlang_kafka_taxi',
      help='The BigQuery table name. Should not already exist.')
  known_args, pipeline_args = parser.parse_known_args()

  pipeline_options = PipelineOptions(
      pipeline_args, save_main_session=True, streaming=True)

  # We also require the --project option to access --bq_dataset.
  project = pipeline_options.view_as(GoogleCloudOptions).project
  if project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  run(
      known_args.bootstrap_servers,
      known_args.topic,
      known_args.with_metadata,
      known_args.bq_dataset,
      known_args.bq_table_name,
      project,
      pipeline_options)
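
# Example invocation (a sketch with illustrative placeholder values,
# mirroring the commented-out pipeline_args in run() above; the flags you
# actually need depend on your runner and environment):
#
#   python kafka_taxi.py \
#     --bootstrap_servers=123.45.67.89:9092 \
#     --topic=kafka_taxirides_realtime \
#     --project=my-project \
#     --runner=DataflowRunner \
#     --temp_location=my-temp-location \
#     --region=my-region \
#     --num_workers=my-num-workers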