github.com/apache/beam/sdks/v2@v2.48.2/go/examples/xlang/bigquery/wordcount.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Wordcount is an example using cross-language BigQuery transforms to read and write to BigQuery.
// This example runs a batch pipeline that reads from the public table "shakespeare" described at
// https://cloud.google.com/bigquery/public-data#sample_tables. It reads word counts broken down
// per work, aggregates them to find each word's total count across all works, as well as the
// average number of times a word appears in the works that contain it, and then writes that data
// to a given output table.
//
// This example is only expected to work on Dataflow, and requires a cross-language expansion
// service that can expand BigQuery read and write transforms. An address to a persistent expansion
// service can be provided as a flag, or if none is specified then the SDK will attempt to
// automatically start an appropriate expansion service.
//
// # Running an Expansion Server
//
// If the automatic expansion service functionality is not available for your environment, or if
// you want improved performance, you will need to start a persistent expansion service. These
// instructions cover running the Java SchemaIO Expansion Service, and therefore require a JDK
// installation in a version supported by Beam. Depending on whether you are running this from a
// numbered Beam release or a development environment, there are two sources you may use for the
// expansion service.
//
// Numbered release: The expansion service jar is vendored as module
// org.apache.beam:beam-sdks-java-io-google-cloud-platform-expansion-service in Maven Repository.
// This jar can be executed directly with the following command:
//
//	java -jar <jar_name> <port_number>
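//
// For example, with the 2.48.0 release the invocation might look like the following (the jar
// filename and port here are illustrative; use the jar you actually downloaded and any open port):
//
//	java -jar beam-sdks-java-io-google-cloud-platform-expansion-service-2.48.0.jar 8097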
//
// Development env: This requires that the JAVA_HOME environment variable points to your JDK
// installation. From the root `beam/` directory of the Apache Beam repository, the jar can be
// built (or built and run) with the following commands:
//
//	./gradlew :sdks:java:io:google-cloud-platform:expansion-service:build
//	./gradlew :sdks:java:io:google-cloud-platform:expansion-service:runExpansionService -PconstructionService.port=<port_num>
//
// # Running the Example on GCP
//
// An example command for executing this pipeline on GCP is as follows:
//
//	export PROJECT="$(gcloud config get-value project)"
//	export TEMP_LOCATION="gs://MY-BUCKET/temp"
//	export STAGING_LOCATION="gs://MY-BUCKET/staging"
//	export REGION="us-central1"
//	export JOB_NAME="bigquery-wordcount-`date +%Y%m%d-%H%M%S`"
//	export EXPANSION_ADDR="localhost:1234"
//	export OUTPUT_TABLE="project_id:dataset_id.table_id"
//	go run ./sdks/go/examples/xlang/bigquery/wordcount.go \
//	  --runner=DataflowRunner \
//	  --temp_location=$TEMP_LOCATION \
//	  --staging_location=$STAGING_LOCATION \
//	  --project=$PROJECT \
//	  --region=$REGION \
//	  --job_name="${JOB_NAME}" \
//	  --expansion_addr=$EXPANSION_ADDR \
//	  --out_table=$OUTPUT_TABLE
//
// # Running the Example From a Git Clone
//
// When running on a development environment, a custom container will likely need to be provided
// for the cross-language SDK. First this will require building and pushing the SDK container to a
// container repository, such as Docker Hub.
//
//	export DOCKER_ROOT="Your Docker Repository Root"
//	./gradlew :sdks:java:container:java8:docker -Pdocker-repository-root=$DOCKER_ROOT -Pdocker-tag=latest
//	docker push $DOCKER_ROOT/beam_java8_sdk:latest
//
// For runners in local mode, simply building the container using the default values for
// docker-repository-root and docker-tag will make it accessible locally.
//
// Additionally, you must provide the location of your custom container to the pipeline with the
// --sdk_harness_container_image_override flag for Java, or the --environment_config flag for Go.
// For example:
//
//	--sdk_harness_container_image_override=".*java.*,${DOCKER_ROOT}/beam_java8_sdk:latest" \
//	--environment_config=${DOCKER_ROOT}/beam_go_sdk:latest
package main

import (
	"context"
	"flag"
	"math"
	"reflect"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/xlang/bigqueryio"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
)

var (
	// Set this to the address of the expansion service to use for BigQuery read and write, or leave
	// unspecified to attempt to automatically start an expansion service.
	expansionAddr = flag.String("expansion_addr", "",
		"Address of Expansion Service. If not specified, attempts to automatically start an appropriate expansion service.")
	// Set this required option to specify where to write the output. If the table does not exist,
	// a new one will be created. If the table already exists, elements will be appended to it.
	outTable = flag.String("out_table", "", "Output table (required).")
)

func init() {
	register.Combiner3[WordsAccum, ShakesRow, CountsRow](&WordsCombine{})
}

// ShakesRow is a struct corresponding to the schema of the Shakespeare input table. In order to
// be read properly, field names must match the column names in the BigQuery table, so some fields
// must include underscores.
type ShakesRow struct {
	Word        string `beam:"word"`
	Word_count  int64  `beam:"word_count"`
	Corpus      string `beam:"corpus"`
	Corpus_date int64  `beam:"corpus_date"`
}
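
// An illustrative row from the input table, as it would be decoded into a ShakesRow (the values
// here are hypothetical, not actual table contents):
//
//	ShakesRow{Word: "brave", Word_count: 3, Corpus: "hamlet", Corpus_date: 1600}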

// CountsRow is a struct corresponding to the schema of the output table. For writes, field names
// are derived from the Beam schema names specified below as struct tags.
type CountsRow struct {
	// Word is the word being counted.
	Word string `beam:"word"`
	// WordCount is the count of how many times the word appears in all works combined.
	WordCount int64 `beam:"word_count"`
	// CorpusCount is the count of how many works the word appears in.
	CorpusCount int64 `beam:"corpus_count"`
	// AvgCount is the average number of times a word appears in all works that it appears in. In
	// other words, this is equivalent to WordCount divided by CorpusCount.
	AvgCount float64 `beam:"avg_count"`
}

// WordsAccum is an accumulator for combining Shakespeare word counts in order to get averages of
// word counts.
type WordsAccum struct {
	// Word is the word being counted.
	Word string
	// Count is the number of times this word has appeared, or in other words the number of corpora
	// it appears in (assuming that the input never repeats a word and corpus pair).
	Count int64
	// Sum is the sum of word counts from inputs.
	Sum int64
}

// WordsCombine is a CombineFn that adds up word counts and calculates the average count per
// corpus.
type WordsCombine struct{}

// CreateAccumulator creates a default WordsAccum.
func (fn *WordsCombine) CreateAccumulator() WordsAccum {
	return WordsAccum{}
}

// AddInput sums up word counts and increments the corpus count.
func (fn *WordsCombine) AddInput(a WordsAccum, row ShakesRow) WordsAccum {
	a.Word = row.Word
	a.Count++
	a.Sum += row.Word_count
	return a
}

// MergeAccumulators sums up the various counts being accumulated, keeping the word from
// whichever accumulator has already seen an input.
func (fn *WordsCombine) MergeAccumulators(a, v WordsAccum) WordsAccum {
	word := a.Word
	if word == "" {
		word = v.Word
	}
	return WordsAccum{Word: word, Count: a.Count + v.Count, Sum: a.Sum + v.Sum}
}

// ExtractOutput calculates the average and fills out the output row.
func (fn *WordsCombine) ExtractOutput(a WordsAccum) CountsRow {
	row := CountsRow{
		Word:        a.Word,
		WordCount:   a.Sum,
		CorpusCount: a.Count,
	}
	if a.Count == 0 {
		row.AvgCount = math.NaN()
	} else {
		row.AvgCount = float64(a.Sum) / float64(a.Count)
	}
	return row
}
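
// As a worked example with hypothetical inputs: if the word "brave" appears with word_count 4 in
// one corpus and word_count 2 in another, the combine produces
// CountsRow{Word: "brave", WordCount: 6, CorpusCount: 2, AvgCount: 3.0}.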

func main() {
	flag.Parse()
	beam.Init()

	ctx := context.Background()
	// out_table is documented as required, so fail fast if it is missing rather than letting the
	// cross-language write fail later with a less obvious error.
	if *outTable == "" {
		log.Fatal(ctx, "No output table specified. Use --out_table to provide one.")
	}

	p := beam.NewPipeline()
	s := p.Root()

	// Read from the public BigQuery table.
	inType := reflect.TypeOf((*ShakesRow)(nil)).Elem()
	rows := bigqueryio.Read(s, inType,
		bigqueryio.FromTable("bigquery-public-data:samples.shakespeare"),
		bigqueryio.ReadExpansionAddr(*expansionAddr))

	// Combine the data per word.
	keyed := beam.ParDo(s, func(elm ShakesRow) (string, ShakesRow) {
		return elm.Word, elm
	}, rows)
	counts := beam.CombinePerKey(s, &WordsCombine{}, keyed)
	countVals := beam.DropKey(s, counts)

	// Write the data to the given BigQuery table destination, creating the table if needed.
	bigqueryio.Write(s, *outTable, countVals,
		bigqueryio.CreateDisposition(bigqueryio.CreateIfNeeded),
		bigqueryio.WriteExpansionAddr(*expansionAddr))

	if err := beamx.Run(ctx, p); err != nil {
		log.Fatalf(ctx, "Failed to execute job: %v", err)
	}
}