// Source: github.com/apache/beam/sdks/v2@v2.48.2/go/examples/cookbook/tornadoes/tornadoes.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements.  See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License.  You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// tornadoes is an example that reads the public samples of weather data from
// BigQuery, counts the number of tornadoes that occur in each month, and
// writes the results to BigQuery.
//
// Concepts: Reading/writing BigQuery; Using Go types for better type-safety.
//
// Note: Before running this example, you must create a BigQuery dataset to
// contain your output table as described here:
//
//	https://cloud.google.com/bigquery/docs/tables#create-table
//
// To execute this pipeline locally, specify the BigQuery table for the output
// with the form:
//
//	--output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID
//
// The BigQuery input table defaults to clouddataflow-readonly:samples.weather_stations
// and can be overridden with the --input flag.
34 package main 35 36 import ( 37 "context" 38 "flag" 39 "reflect" 40 41 "github.com/apache/beam/sdks/v2/go/pkg/beam" 42 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/bigqueryio" 43 "github.com/apache/beam/sdks/v2/go/pkg/beam/log" 44 "github.com/apache/beam/sdks/v2/go/pkg/beam/options/gcpopts" 45 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 46 "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats" 47 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 48 ) 49 50 var ( 51 input = flag.String("input", "clouddataflow-readonly:samples.weather_stations", "BigQuery table with weather data to read from, specified as <project_id>:<dataset_id>.<table_id>") 52 output = flag.String("output", "", "BigQuery table to write to, specified as <project_id>:<dataset_id>.<table_id>. The dataset must already exist") 53 ) 54 55 func init() { 56 register.Function2x1(formatFn) 57 register.Function2x0(extractFn) 58 register.Emitter1[Month]() 59 } 60 61 // Month is represented as 'int' in BQ. A Go type definition allows 62 // us to write more type-safe transformations. 63 type Month int 64 65 // WeatherDataRow defines a BQ schema using field annotations. 66 // It is used as a projection to extract rows from a table. 67 type WeatherDataRow struct { 68 Tornado bool `bigquery:"tornado"` 69 Month Month `bigquery:"month"` 70 } 71 72 // TornadoRow defines the output BQ schema. Each row in the output dataset 73 // conforms to this schema. A TornadoRow value represents a concrete row. 74 type TornadoRow struct { 75 Month Month `bigquery:"month"` 76 Count int `bigquery:"tornado_count"` 77 } 78 79 // CountTornadoes computes the number of tornadoes pr month. It takes a 80 // PCollection<WeatherDataRow> and returns a PCollection<TornadoRow>. 81 func CountTornadoes(s beam.Scope, rows beam.PCollection) beam.PCollection { 82 s = s.Scope("CountTornadoes") 83 84 // row... => month... 85 months := beam.ParDo(s, extractFn, rows) 86 // month... => <month,count>... 
87 counted := stats.Count(s, months) 88 // <month,count>... => row... 89 return beam.ParDo(s, formatFn, counted) 90 } 91 92 // extractFn outputs the month iff a tornado happened. 93 func extractFn(row WeatherDataRow, emit func(Month)) { 94 if row.Tornado { 95 emit(row.Month) 96 } 97 } 98 99 // formatFn converts a KV<Month, int> to a TornadoRow. 100 func formatFn(month Month, count int) TornadoRow { 101 return TornadoRow{Month: month, Count: count} 102 } 103 104 func main() { 105 flag.Parse() 106 beam.Init() 107 108 ctx := context.Background() 109 110 if *output == "" { 111 log.Exit(ctx, "No output table specified. Use --output=<table>") 112 } 113 project := gcpopts.GetProject(ctx) 114 115 log.Info(ctx, "Running tornadoes") 116 117 p := beam.NewPipeline() 118 s := p.Root() 119 rows := bigqueryio.Read(s, project, *input, reflect.TypeOf(WeatherDataRow{})) 120 out := CountTornadoes(s, rows) 121 bigqueryio.Write(s, project, *output, out) 122 123 if err := beamx.Run(ctx, p); err != nil { 124 log.Exitf(ctx, "Failed to execute job: %v", err) 125 } 126 }