github.com/apache/beam/sdks/v2@v2.48.2/go/examples/cookbook/join/join.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  package main
    17  
    18  import (
    19  	"context"
    20  	"flag"
    21  	"reflect"
    22  
    23  	"fmt"
    24  
    25  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    26  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/bigqueryio"
    27  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
    28  	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
    29  	"github.com/apache/beam/sdks/v2/go/pkg/beam/options/gcpopts"
    30  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    31  	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
    32  )
    33  
    34  // See: https://github.com/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/cookbook/JoinExamples.java
    35  
    36  const (
    37  	gdeltEventsTable  = "clouddataflow-readonly:samples.gdelt_sample"
    38  	countryCodesTable = "gdelt-bq:full.crosswalk_geocountrycodetohuman"
    39  )
    40  
    41  var (
    42  	output = flag.String("output", "", "Output filename")
    43  )
    44  
    45  func init() {
    46  	register.Function1x2(extractEventDataFn)
    47  	register.Function1x2(extractCountryInfoFn)
    48  	register.Function2x1(formatFn)
    49  	register.Function4x0(processFn)
    50  	register.Iter1[string]()
    51  	register.Emitter2[Code, string]()
    52  }
    53  
    54  type Code string
    55  
    56  type CountryInfoRow struct {
    57  	Code Code   `bigquery:"FIPSCC"`
    58  	Name string `bigquery:"HumanName"`
    59  }
    60  
    61  type EventDataRow struct {
    62  	Code Code   `bigquery:"ActionGeo_CountryCode"`
    63  	Date int    `bigquery:"SQLDATE"`
    64  	Name string `bigquery:"Actor1Name"`
    65  	URL  string `bigquery:"SOURCEURL"`
    66  }
    67  
    68  func joinEvents(s beam.Scope, events, countries beam.PCollection) beam.PCollection {
    69  	joined := beam.CoGroupByKey(s,
    70  		beam.ParDo(s, extractEventDataFn, events),
    71  		beam.ParDo(s, extractCountryInfoFn, countries))
    72  	result := beam.ParDo(s, processFn, joined)
    73  	return beam.ParDo(s, formatFn, result)
    74  }
    75  
    76  func extractEventDataFn(row EventDataRow) (Code, string) {
    77  	return row.Code, fmt.Sprintf("Date: %v, Actor1: %v, url: %v", row.Date, row.Name, row.URL)
    78  }
    79  
    80  func extractCountryInfoFn(row CountryInfoRow) (Code, string) {
    81  	return row.Code, row.Name
    82  }
    83  
    84  func processFn(code Code, events, countries func(*string) bool, emit func(Code, string)) {
    85  	name := "none"
    86  	countries(&name) // grab first (and only) country name, if any
    87  
    88  	var event string
    89  	for events(&event) {
    90  		emit(code, fmt.Sprintf("Country name: %v, Event info: %v", name, event))
    91  	}
    92  }
    93  
    94  func formatFn(code Code, info string) string {
    95  	return fmt.Sprintf("Country code: %v, %v", code, info)
    96  }
    97  
    98  func main() {
    99  	flag.Parse()
   100  	beam.Init()
   101  
   102  	ctx := context.Background()
   103  
   104  	if *output == "" {
   105  		log.Exit(ctx, "No output filename specified. Use --output=<filename>")
   106  	}
   107  	project := gcpopts.GetProject(ctx)
   108  
   109  	log.Info(ctx, "Running join")
   110  
   111  	p := beam.NewPipeline()
   112  	s := p.Root()
   113  	events := bigqueryio.Read(s, project, gdeltEventsTable, reflect.TypeOf(EventDataRow{}))
   114  	countries := bigqueryio.Read(s, project, countryCodesTable, reflect.TypeOf(CountryInfoRow{}))
   115  	formatted := joinEvents(s, events, countries)
   116  	textio.Write(s, *output, formatted)
   117  
   118  	if err := beamx.Run(ctx, p); err != nil {
   119  		log.Exitf(ctx, "Failed to execute job: %v", err)
   120  	}
   121  }