github.com/apache/beam/sdks/v2@v2.48.2/go/examples/cookbook/tornadoes/tornadoes.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  // tornadoes is an example that reads the public samples of weather data from
    17  // BigQuery, counts the number of tornadoes that occur in each month, and
    18  // writes the results to BigQuery.
    19  //
    20  // Concepts: Reading/writing BigQuery; Using Go types for better type-safety.
    21  //
    22  // Note: Before running this example, you must create a BigQuery dataset to
    23  // contain your output table as described here:
    24  //
    25  //	https://cloud.google.com/bigquery/docs/tables#create-table
    26  //
    27  // To execute this pipeline locally, specify the BigQuery table for the output
    28  // with the form:
    29  //
    30  //	--output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
    31  //
    32  // The BigQuery input table defaults to clouddataflow-readonly:samples.weather_stations
    33  // and can be overridden with the --input flag.
    34  package main
    35  
    36  import (
    37  	"context"
    38  	"flag"
    39  	"reflect"
    40  
    41  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    42  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/bigqueryio"
    43  	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
    44  	"github.com/apache/beam/sdks/v2/go/pkg/beam/options/gcpopts"
    45  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    46  	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"
    47  	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
    48  )
    49  
    50  var (
    51  	input  = flag.String("input", "clouddataflow-readonly:samples.weather_stations", "BigQuery table with weather data to read from, specified as <project_id>:<dataset_id>.<table_id>")
    52  	output = flag.String("output", "", "BigQuery table to write to, specified as <project_id>:<dataset_id>.<table_id>. The dataset must already exist")
    53  )
    54  
    55  func init() {
    56  	register.Function2x1(formatFn)
    57  	register.Function2x0(extractFn)
    58  	register.Emitter1[Month]()
    59  }
    60  
    61  // Month is represented as 'int' in BQ. A Go type definition allows
    62  // us to write more type-safe transformations.
    63  type Month int
    64  
    65  // WeatherDataRow defines a BQ schema using field annotations.
    66  // It is used as a projection to extract rows from a table.
    67  type WeatherDataRow struct {
    68  	Tornado bool  `bigquery:"tornado"`
    69  	Month   Month `bigquery:"month"`
    70  }
    71  
    72  // TornadoRow defines the output BQ schema. Each row in the output dataset
    73  // conforms to this schema. A TornadoRow value represents a concrete row.
    74  type TornadoRow struct {
    75  	Month Month `bigquery:"month"`
    76  	Count int   `bigquery:"tornado_count"`
    77  }
    78  
    79  // CountTornadoes computes the number of tornadoes pr month. It takes a
    80  // PCollection<WeatherDataRow> and returns a PCollection<TornadoRow>.
    81  func CountTornadoes(s beam.Scope, rows beam.PCollection) beam.PCollection {
    82  	s = s.Scope("CountTornadoes")
    83  
    84  	// row... => month...
    85  	months := beam.ParDo(s, extractFn, rows)
    86  	// month... => <month,count>...
    87  	counted := stats.Count(s, months)
    88  	// <month,count>... => row...
    89  	return beam.ParDo(s, formatFn, counted)
    90  }
    91  
    92  // extractFn outputs the month iff a tornado happened.
    93  func extractFn(row WeatherDataRow, emit func(Month)) {
    94  	if row.Tornado {
    95  		emit(row.Month)
    96  	}
    97  }
    98  
    99  // formatFn converts a KV<Month, int> to a TornadoRow.
   100  func formatFn(month Month, count int) TornadoRow {
   101  	return TornadoRow{Month: month, Count: count}
   102  }
   103  
   104  func main() {
   105  	flag.Parse()
   106  	beam.Init()
   107  
   108  	ctx := context.Background()
   109  
   110  	if *output == "" {
   111  		log.Exit(ctx, "No output table specified. Use --output=<table>")
   112  	}
   113  	project := gcpopts.GetProject(ctx)
   114  
   115  	log.Info(ctx, "Running tornadoes")
   116  
   117  	p := beam.NewPipeline()
   118  	s := p.Root()
   119  	rows := bigqueryio.Read(s, project, *input, reflect.TypeOf(WeatherDataRow{}))
   120  	out := CountTornadoes(s, rows)
   121  	bigqueryio.Write(s, project, *output, out)
   122  
   123  	if err := beamx.Run(ctx, p); err != nil {
   124  		log.Exitf(ctx, "Failed to execute job: %v", err)
   125  	}
   126  }