github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/bigquery_tornadoes.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A workflow using BigQuery sources and sinks.

The workflow reads from a table that has 'month' and 'tornado' fields as
part of its schema (any additional fields are ignored). The 'month' field
is a number represented as a string (e.g., '6') and the 'tornado' field is
a boolean.

The workflow computes the number of tornadoes in each month and writes the
results to a table (created if needed) with the following schema:

- month: number
- tornado_count: number

This example uses the default behavior of the BigQuery source and sink,
which represents table rows as plain Python dictionaries.
"""

# pytype: skip-file

import argparse
import logging

import apache_beam as beam


def count_tornadoes(input_data):
  """Workflow computing the number of tornadoes for each month that had one.

  Args:
    input_data: a PCollection of dictionaries representing table rows. Each
      dictionary has a 'month' and a 'tornado' key as described in the
      module docstring.

  Returns:
    A PCollection of dictionaries containing 'month' and 'tornado_count'
    keys. Months without tornadoes are skipped.
  """
  return (
      input_data
      # Emit one (month, 1) pair for each row that recorded a tornado.
      | 'months with tornadoes' >> beam.FlatMap(
          lambda row: [(int(row['month']), 1)] if row['tornado'] else [])
      # Sum the 1s per month to get a count.
      | 'monthly count' >> beam.CombinePerKey(sum)
      # Convert (month, count) pairs back into row dictionaries.
      | 'format' >> beam.Map(
          lambda k_v: {'month': k_v[0], 'tornado_count': k_v[1]}))


def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      default='clouddataflow-readonly:samples.weather_stations',
      help=(
          'Input BigQuery table to process, specified as '
          'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
  parser.add_argument(
      '--output',
      required=True,
      help=(
          'Output BigQuery table for results, specified as '
          'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
  parser.add_argument(
      '--gcs_location',
      required=False,
      help='GCS location used to store files when loading data into BigQuery.')

  known_args, pipeline_args = parser.parse_known_args(argv)

  with beam.Pipeline(argv=pipeline_args) as p:
    # Read the table rows into a PCollection of dictionaries.
    rows = p | 'read' >> beam.io.ReadFromBigQuery(table=known_args.input)
    counts = count_tornadoes(rows)

    # Write the output using a "Write" transform that has side effects.
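    # CREATE_IF_NEEDED creates the destination table with the schema below
    # if it does not already exist; WRITE_TRUNCATE replaces any existing
    # rows on every run, so reruns are idempotent.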
    # pylint: disable=expression-not-assigned
    counts | 'Write' >> beam.io.WriteToBigQuery(
        known_args.output,
        schema='month:INTEGER, tornado_count:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    # All operations are deferred; the pipeline runs when the `with` block
    # exits (run() is called implicitly).


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
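Since the transforms in count_tornadoes do not depend on BigQuery, the
function can be exercised locally against an in-memory source. Below is a
minimal sketch, assuming the default DirectRunner; the import path, input
rows, and expected output are illustrative and not part of the example above.

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

# Hypothetical import path; adjust to wherever the module lives.
from bigquery_tornadoes import count_tornadoes

with beam.Pipeline() as p:  # DirectRunner by default
  rows = p | beam.Create([
      {'month': '1', 'tornado': True},
      {'month': '1', 'tornado': True},
      {'month': '2', 'tornado': False},  # no tornado, so month 2 is skipped
  ])
  # Expect a single output row: two tornadoes in month 1.
  assert_that(
      count_tornadoes(rows), equal_to([{'month': 1, 'tornado_count': 2}]))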