github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/bigquery_tornadoes.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """A workflow using BigQuery sources and sinks.
    19  
    20  The workflow will read from a table that has the 'month' and 'tornado' fields as
    21  part of the table schema (other additional fields are ignored). The 'month'
    22  field is a number represented as a string (e.g., '23') and the 'tornado' field
    23  is a boolean field.
    24  
    25  The workflow will compute the number of tornadoes in each month and output
    26  the results to a table (created if needed) with the following schema:
    27  
    28  - month: number
    29  - tornado_count: number
    30  
    31  This example uses the default behavior for BigQuery source and sinks that
    32  represents table rows as plain Python dictionaries.
    33  """

# pytype: skip-file

import argparse
import logging

import apache_beam as beam


def count_tornadoes(input_data):
  """Workflow computing the number of tornadoes for each month that had one.

  Args:
    input_data: a PCollection of dictionaries representing table rows. Each
      dictionary will have a 'month' and a 'tornado' key as described in the
      module comment.

  Returns:
    A PCollection of dictionaries containing 'month' and 'tornado_count' keys.
    Months without tornadoes are skipped.
  """

  return (
      input_data
      | 'months with tornadoes' >> beam.FlatMap(
          lambda row: [(int(row['month']), 1)] if row['tornado'] else [])
      | 'monthly count' >> beam.CombinePerKey(sum)
      | 'format' >>
      beam.Map(lambda k_v: {
          'month': k_v[0], 'tornado_count': k_v[1]
      }))


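# A minimal local sketch, not part of the original example: it exercises
# count_tornadoes on a few in-memory rows (illustrative values, not real
# table data) so the transform can be tried without BigQuery. It assumes
# the default DirectRunner; the function name is hypothetical.
def _count_tornadoes_local_sketch():
  with beam.Pipeline() as p:
    rows = p | beam.Create([
        {'month': '2', 'tornado': True},
        {'month': '2', 'tornado': True},
        {'month': '3', 'tornado': False},
    ])
    # Expected output: a single element {'month': 2, 'tornado_count': 2}.
    _ = count_tornadoes(rows) | beam.Map(print)

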
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      default='clouddataflow-readonly:samples.weather_stations',
      help=(
          'Input BigQuery table to process, specified as: '
          'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
  parser.add_argument(
      '--output',
      required=True,
      help=(
          'Output BigQuery table for results, specified as: '
          'PROJECT:DATASET.TABLE or DATASET.TABLE.'))

  parser.add_argument(
      '--gcs_location',
      required=False,
      help=('GCS location used to store files when loading '
            'data into BigQuery.'))

  known_args, pipeline_args = parser.parse_known_args(argv)

  with beam.Pipeline(argv=pipeline_args) as p:

    # Read the table rows into a PCollection.
    rows = p | 'read' >> beam.io.ReadFromBigQuery(table=known_args.input)
    counts = count_tornadoes(rows)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    counts | 'Write' >> beam.io.WriteToBigQuery(
        known_args.output,
        schema='month:INTEGER, tornado_count:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
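    # Note: CREATE_IF_NEEDED creates the output table if it does not already
    # exist, and WRITE_TRUNCATE replaces any existing rows on each run.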

    # The pipeline runs when the `with` block exits; the transforms above
    # are deferred until then.


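# Example invocation (names below are placeholders, not real resources):
#
#   python bigquery_tornadoes.py \
#       --output YOUR_PROJECT:YOUR_DATASET.tornado_counts
#
# Any remaining pipeline options (e.g. --runner, --project, or, depending on
# the runner, --temp_location) are forwarded via pipeline_args.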
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()