github.com/apache/beam/sdks/v2@v2.48.2/go/examples/xlang/bigquery/wordcount.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements.  See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License.  You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Wordcount is an example using cross-language BigQuery transforms to read and write to BigQuery.
// This example runs a batch pipeline that reads from the public table "shakespeare" described here:
// https://cloud.google.com/bigquery/public-data#sample_tables. Each input row holds the count of
// one word within one of Shakespeare's works. The pipeline aggregates those rows to find each
// word's total count across all works, along with the average number of times the word appears
// in the works that contain it, and then writes all that data to a given output table.
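//
// For example (hypothetical values; the real table is much larger), given the input rows
//
//	ShakesRow{Word: "brave", Word_count: 2, Corpus: "hamlet"}
//	ShakesRow{Word: "brave", Word_count: 3, Corpus: "kinglear"}
//
// the pipeline would write a single output row
//
//	CountsRow{Word: "brave", WordCount: 5, CorpusCount: 2, AvgCount: 2.5}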
//
// This example is only expected to work on Dataflow, and requires a cross-language expansion
// service that can expand BigQuery read and write transforms. An address to a persistent expansion
// service can be provided as a flag, or if none is specified then the SDK will attempt to
// automatically start an appropriate expansion service.
//
// # Running an Expansion Server
//
// If the automatic expansion service functionality is not available for your environment, or if
// you want improved performance, you will need to start a persistent expansion service. These
// instructions cover running the Java SchemaIO Expansion Service, and therefore require a JDK
// installation in a version supported by Beam. Depending on whether you are running this from a
// numbered Beam release or a development environment, there are two sources you may use for the
// expansion service.
//
// Numbered release: The expansion service jar is vendored as module
// org.apache.beam:beam-sdks-java-io-google-cloud-platform-expansion-service in the Maven Central
// Repository. This jar can be executed directly with the following command:
//
//	java -jar <jar_name> <port_number>
//
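// For instance, with the jar downloaded from Maven Central for Beam 2.48.0 and an arbitrary free
// port (both values are illustrative):
//
//	java -jar beam-sdks-java-io-google-cloud-platform-expansion-service-2.48.0.jar 8097
//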
// Development env: This requires that the JAVA_HOME environment variable points to your JDK
// installation. From the root `beam/` directory of the Apache Beam repository, the jar can be
// built (or built and run) with the following commands:
//
//	./gradlew :sdks:java:io:google-cloud-platform:expansion-service:build
//	./gradlew :sdks:java:io:google-cloud-platform:expansion-service:runExpansionService -PconstructionService.port=<port_num>
//
// # Running the Example on GCP
//
// An example command for executing this pipeline on GCP is as follows:
//
//	export PROJECT="$(gcloud config get-value project)"
//	export TEMP_LOCATION="gs://MY-BUCKET/temp"
//	export STAGING_LOCATION="gs://MY-BUCKET/staging"
//	export REGION="us-central1"
//	export JOB_NAME="bigquery-wordcount-`date +%Y%m%d-%H%M%S`"
//	export EXPANSION_ADDR="localhost:1234"
//	export OUTPUT_TABLE="project_id:dataset_id.table_id"
//	go run ./sdks/go/examples/xlang/bigquery/wordcount.go \
//	  --runner=DataflowRunner \
//	  --temp_location=$TEMP_LOCATION \
//	  --staging_location=$STAGING_LOCATION \
//	  --project=$PROJECT \
//	  --region=$REGION \
//	  --job_name="${JOB_NAME}" \
//	  --expansion_addr=$EXPANSION_ADDR \
//	  --out_table=$OUTPUT_TABLE
//
// # Running the Example From a Git Clone
//
// When running on a development environment, a custom container will likely need to be provided
// for the cross-language SDK. First this will require building and pushing the SDK container to
// a container repository, such as Docker Hub.
//
//	export DOCKER_ROOT="Your Docker Repository Root"
//	./gradlew :sdks:java:container:java8:docker -Pdocker-repository-root=$DOCKER_ROOT -Pdocker-tag=latest
//	docker push $DOCKER_ROOT/beam_java8_sdk:latest
//
// For runners in local mode, simply building the container using the default values for
// docker-repository-root and docker-tag will make it accessible locally.
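//
// For example, building with the defaults (no -P overrides) is simply:
//
//	./gradlew :sdks:java:container:java8:docker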
//
// Additionally, you must provide the location of your custom container to the pipeline with the
// --sdk_harness_container_image_override flag for Java, or the --environment_config flag for Go.
// For example:
//
//	--sdk_harness_container_image_override=".*java.*,${DOCKER_ROOT}/beam_java8_sdk:latest" \
//	--environment_config=${DOCKER_ROOT}/beam_go_sdk:latest
package main

import (
	"context"
	"flag"
	"math"
	"reflect"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/xlang/bigqueryio"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
)

var (
	// Set this to the address of the expansion service to use for BigQuery read and write, or leave
	// unspecified to attempt to automatically start an expansion service.
	expansionAddr = flag.String("expansion_addr", "",
		"Address of Expansion Service. If not specified, attempts to automatically start an appropriate expansion service.")
	// Set this required option to specify where to write the output. If the table does not exist,
	// a new one will be created. If the table already exists, elements will be appended to it.
	outTable = flag.String("out_table", "", "Output table (required).")
)

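// init registers the CombineFn ahead of time. register.Combiner3's type parameters are the
// combiner's accumulator, input, and output types; registering them lets the SDK execute the
// CombineFn on workers without falling back to slower reflection-based invocation.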
func init() {
	register.Combiner3[WordsAccum, ShakesRow, CountsRow](&WordsCombine{})
}

// ShakesRow is a struct corresponding to the schema of the Shakespeare input table. In order to
// be read properly, field names must match the column names from the BigQuery table, so some
// fields must include underscores.
type ShakesRow struct {
	Word        string `beam:"word"`
	Word_count  int64  `beam:"word_count"`
	Corpus      string `beam:"corpus"`
	Corpus_date int64  `beam:"corpus_date"`
}

// CountsRow is a struct corresponding to the schema of the output table. For writes, field names
// are derived from the Beam schema names specified below as struct tags.
type CountsRow struct {
	// Word is the word being counted.
	Word string `beam:"word"`
	// WordCount is the count of how many times the word appears in all works combined.
	WordCount int64 `beam:"word_count"`
	// CorpusCount is the count of how many works the word appears in.
	CorpusCount int64 `beam:"corpus_count"`
	// AvgCount is the average number of times a word appears in all works that it appears in. In
	// other words, this is equivalent to WordCount divided by CorpusCount.
	AvgCount float64 `beam:"avg_count"`
}

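// For reference, the BigQuery column types implied by the struct tags above would look roughly
// like the following (a sketch; the actual schema is created by the cross-language write
// transform):
//
//	word STRING, word_count INT64, corpus_count INT64, avg_count FLOAT64
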
// WordsAccum is an accumulator for combining Shakespeare word counts in order to get averages of
// word counts.
type WordsAccum struct {
	// Word is the word being counted.
	Word string
	// Count is the number of times this word has appeared, or in other words the number of
	// corpuses it appears in (assuming that the input never repeats a word and corpus pair).
	Count int64
	// Sum is the sum of word counts from inputs.
	Sum int64
}

// WordsCombine is a CombineFn that adds up word counts and calculates average number of counts.
type WordsCombine struct{}

// CreateAccumulator creates a default WordsAccum.
func (fn *WordsCombine) CreateAccumulator() WordsAccum {
	return WordsAccum{}
}

// AddInput sums up word counts and increments the corpus count.
func (fn *WordsCombine) AddInput(a WordsAccum, row ShakesRow) WordsAccum {
	a.Word = row.Word
	a.Count++
	a.Sum += row.Word_count
	return a
}

// MergeAccumulators sums up the various counts being accumulated.
func (fn *WordsCombine) MergeAccumulators(a, v WordsAccum) WordsAccum {
	return WordsAccum{Word: a.Word, Count: a.Count + v.Count, Sum: a.Sum + v.Sum}
}

// ExtractOutput calculates the average and fills out the output rows.
func (fn *WordsCombine) ExtractOutput(a WordsAccum) CountsRow {
	row := CountsRow{
		Word:        a.Word,
		WordCount:   a.Sum,
		CorpusCount: a.Count,
	}
	if a.Count == 0 {
		row.AvgCount = math.NaN()
	} else {
		row.AvgCount = float64(a.Sum) / float64(a.Count)
	}
	return row
}
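
// As a concrete illustration (hypothetical values): feeding AddInput two rows for "brave" with
// word counts 2 and 3 yields WordsAccum{Word: "brave", Count: 2, Sum: 5}, from which
// ExtractOutput produces CountsRow{Word: "brave", WordCount: 5, CorpusCount: 2, AvgCount: 2.5}.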

func main() {
	flag.Parse()
	beam.Init()

	ctx := context.Background()

	// The output table is documented as required, so fail fast if it is missing.
	if *outTable == "" {
		log.Fatalf(ctx, "No output table specified. Use --out_table=project_id:dataset_id.table_id")
	}

	p := beam.NewPipeline()
	s := p.Root()

	// Read from the public BigQuery table.
	inType := reflect.TypeOf((*ShakesRow)(nil)).Elem()
	rows := bigqueryio.Read(s, inType,
		bigqueryio.FromTable("bigquery-public-data:samples.shakespeare"),
		bigqueryio.ReadExpansionAddr(*expansionAddr))

	// Key each element by its word, then combine the data per word.
	keyed := beam.ParDo(s, func(elm ShakesRow) (string, ShakesRow) {
		return elm.Word, elm
	}, rows)
	counts := beam.CombinePerKey(s, &WordsCombine{}, keyed)
	countVals := beam.DropKey(s, counts)

	// Write the data to the given BigQuery table destination, creating the table if needed.
	bigqueryio.Write(s, *outTable, countVals,
		bigqueryio.CreateDisposition(bigqueryio.CreateIfNeeded),
		bigqueryio.WriteExpansionAddr(*expansionAddr))

	if err := beamx.Run(ctx, p); err != nil {
		log.Fatalf(ctx, "Failed to execute job: %v", err)
	}
}