github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/kafkataxi/kafka_taxi.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""An example that writes to and reads from Kafka.

This example reads from the Pub/Sub NYC Taxi stream described in
https://github.com/googlecodelabs/cloud-dataflow-nyc-taxi-tycoon, writes to a
given Kafka topic and reads back from the same Kafka topic.
"""

# pytype: skip-file

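# Example invocation (all values below are illustrative placeholders):
#
#   python -m apache_beam.examples.kafkataxi.kafka_taxi \
#       --runner DataflowRunner \
#       --project my-project \
#       --region my-region \
#       --temp_location my-temp-location \
#       --bootstrap_servers 123.45.67.89:9092
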
import logging
import sys
import typing

import apache_beam as beam
from apache_beam.io.kafka import ReadFromKafka
from apache_beam.io.kafka import WriteToKafka
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions

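# Note: ReadFromKafka and WriteToKafka are cross-language transforms backed by
# the Java KafkaIO connector. Building the pipeline typically requires a local
# Java installation (to start the bundled expansion service) and a runner that
# supports multi-language pipelines, such as Dataflow Runner v2.
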
def run(
    bootstrap_servers,
    topic,
    with_metadata,
    bq_dataset,
    bq_table_name,
    project,
    pipeline_options):
  # bootstrap_servers = '123.45.67.89:9092'
  # topic = 'kafka_taxirides_realtime'
  # pipeline_args = ['--project', 'my-project',
  #                  '--runner', 'DataflowRunner',
  #                  '--temp_location', 'my-temp-location',
  #                  '--region', 'my-region',
  #                  '--num_workers', 'my-num-workers']

  window_size = 15  # Size of the window in seconds.

  def log_ride(ride):
    if 'timestamp' in ride:
      logging.info(
          'Found ride at latitude %r and longitude %r with %r '
          'passengers at timestamp %r',
          ride['latitude'],
          ride['longitude'],
          ride['passenger_count'],
          ride['timestamp'])
    else:
      logging.info(
          'Found ride at latitude %r and longitude %r with %r '
          'passengers',
          ride['latitude'],
          ride['longitude'],
          ride['passenger_count'])

  def convert_kafka_record_to_dictionary(record):
    # The records have a 'value' attribute when --with_metadata is given.
    if hasattr(record, 'value'):
      ride_bytes = record.value
    elif isinstance(record, tuple):
      ride_bytes = record[1]
    else:
      raise RuntimeError('unknown record type: %s' % type(record))
    # Convert the bytes record read from Kafka to a dictionary.
    import ast
    ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
    output = {
        key: ride[key]
        for key in ['latitude', 'longitude', 'passenger_count']
    }
    if hasattr(record, 'timestamp'):
      # The timestamp is read from the Kafka record metadata.
      output['timestamp'] = record.timestamp
    return output

  with beam.Pipeline(options=pipeline_options) as pipeline:
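    # Write branch: read taxi ride events from the public Pub/Sub topic, pair
    # each payload with an empty key (the Kafka write transform expects
    # key/value pairs), window them, and write them to the Kafka topic.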
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime').
        with_output_types(bytes)
        | beam.Map(lambda x: (b'', x)).with_output_types(
            typing.Tuple[bytes, bytes])  # Kafka write transforms expect KVs.
        | beam.WindowInto(beam.window.FixedWindows(window_size))
        | WriteToKafka(
            producer_config={'bootstrap.servers': bootstrap_servers},
            topic=topic))

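    # Read branch: read the same records back from the Kafka topic (optionally
    # with Kafka record metadata such as the timestamp) and convert each
    # record into a plain dictionary.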
    ride_col = (
        pipeline
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrap_servers},
            topics=[topic],
            with_metadata=with_metadata)
        | beam.Map(lambda record: convert_kafka_record_to_dictionary(record)))

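    # Output branch: write the converted rides to BigQuery when a dataset is
    # given; otherwise just log each ride.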
    if bq_dataset:
      schema = 'latitude:STRING,longitude:STRING,passenger_count:INTEGER'
      if with_metadata:
        schema += ',timestamp:STRING'
      _ = (
          ride_col
          | beam.io.WriteToBigQuery(bq_table_name, bq_dataset, project, schema))
    else:
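      # FlatMap with a callable that returns None emits no elements, so this
      # step only logs each ride without producing further output.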
      _ = ride_col | beam.FlatMap(lambda ride: log_ride(ride))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  import argparse

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--bootstrap_servers',
      dest='bootstrap_servers',
      required=True,
      help='Bootstrap servers for the Kafka cluster. Should be accessible by '
      'the runner.')
  parser.add_argument(
      '--topic',
      dest='topic',
      default='kafka_taxirides_realtime',
      help='Kafka topic to write to and read from.')
  parser.add_argument(
      '--with_metadata',
      default=False,
      action='store_true',
      help='If set, also reads metadata from the Kafka broker.')
  parser.add_argument(
      '--bq_dataset',
      type=str,
      default='',
      help='BigQuery dataset to write tables to. '
      'If set, export data to a BigQuery table instead of just logging. '
      'Must already exist.')
  parser.add_argument(
      '--bq_table_name',
      default='xlang_kafka_taxi',
      help='The BigQuery table name. Should not already exist.')
  known_args, pipeline_args = parser.parse_known_args()

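  # save_main_session makes the module-level imports above available to the
  # workers; streaming=True is needed because the pipeline reads from
  # unbounded sources (Pub/Sub and Kafka).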
  pipeline_options = PipelineOptions(
      pipeline_args, save_main_session=True, streaming=True)

  # We also require the --project option to access --bq_dataset
  project = pipeline_options.view_as(GoogleCloudOptions).project
  if project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  run(
      known_args.bootstrap_servers,
      known_args.topic,
      known_args.with_metadata,
      known_args.bq_dataset,
      known_args.bq_table_name,
      project,
      pipeline_options)