github.com/apache/beam/sdks/v2@v2.48.2/go/examples/cookbook/join/join.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 package main 17 18 import ( 19 "context" 20 "flag" 21 "reflect" 22 23 "fmt" 24 25 "github.com/apache/beam/sdks/v2/go/pkg/beam" 26 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/bigqueryio" 27 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" 28 "github.com/apache/beam/sdks/v2/go/pkg/beam/log" 29 "github.com/apache/beam/sdks/v2/go/pkg/beam/options/gcpopts" 30 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 31 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 32 ) 33 34 // See: https://github.com/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/cookbook/JoinExamples.java 35 36 const ( 37 gdeltEventsTable = "clouddataflow-readonly:samples.gdelt_sample" 38 countryCodesTable = "gdelt-bq:full.crosswalk_geocountrycodetohuman" 39 ) 40 41 var ( 42 output = flag.String("output", "", "Output filename") 43 ) 44 45 func init() { 46 register.Function1x2(extractEventDataFn) 47 register.Function1x2(extractCountryInfoFn) 48 register.Function2x1(formatFn) 49 register.Function4x0(processFn) 50 register.Iter1[string]() 51 register.Emitter2[Code, string]() 52 } 53 54 type Code string 55 56 type CountryInfoRow struct { 57 Code Code `bigquery:"FIPSCC"` 58 Name string `bigquery:"HumanName"` 59 } 60 61 type EventDataRow struct { 62 Code Code `bigquery:"ActionGeo_CountryCode"` 63 Date int `bigquery:"SQLDATE"` 64 Name string `bigquery:"Actor1Name"` 65 URL string `bigquery:"SOURCEURL"` 66 } 67 68 func joinEvents(s beam.Scope, events, countries beam.PCollection) beam.PCollection { 69 joined := beam.CoGroupByKey(s, 70 beam.ParDo(s, extractEventDataFn, events), 71 beam.ParDo(s, extractCountryInfoFn, countries)) 72 result := beam.ParDo(s, processFn, joined) 73 return beam.ParDo(s, formatFn, result) 74 } 75 76 func extractEventDataFn(row EventDataRow) (Code, string) { 77 return row.Code, fmt.Sprintf("Date: %v, Actor1: %v, url: %v", row.Date, row.Name, row.URL) 78 } 79 80 func extractCountryInfoFn(row CountryInfoRow) (Code, string) { 81 return row.Code, row.Name 82 } 83 84 func processFn(code Code, events, countries func(*string) bool, emit func(Code, string)) { 85 name := "none" 86 countries(&name) // grab first (and only) country name, if any 87 88 var event string 89 for events(&event) { 90 emit(code, fmt.Sprintf("Country name: %v, Event info: %v", name, event)) 91 } 92 } 93 94 func formatFn(code Code, info string) string { 95 return fmt.Sprintf("Country code: %v, %v", code, info) 96 } 97 98 func main() { 99 flag.Parse() 100 beam.Init() 101 102 ctx := context.Background() 103 104 if *output == "" { 105 log.Exit(ctx, "No output filename specified. Use --output=<filename>") 106 } 107 project := gcpopts.GetProject(ctx) 108 109 log.Info(ctx, "Running join") 110 111 p := beam.NewPipeline() 112 s := p.Root() 113 events := bigqueryio.Read(s, project, gdeltEventsTable, reflect.TypeOf(EventDataRow{})) 114 countries := bigqueryio.Read(s, project, countryCodesTable, reflect.TypeOf(CountryInfoRow{})) 115 formatted := joinEvents(s, events, countries) 116 textio.Write(s, *output, formatted) 117 118 if err := beamx.Run(ctx, p); err != nil { 119 log.Exitf(ctx, "Failed to execute job: %v", err) 120 } 121 }