github.com/apache/beam/sdks/v2@v2.48.2/go/examples/cookbook/combine/combine.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 package main 17 18 // See: https://github.com/apache/beam/blob/master/examples/java/src/main/java/org/apache/beam/examples/cookbook/CombinePerKeyExamples.java 19 20 import ( 21 "context" 22 "flag" 23 "fmt" 24 "reflect" 25 26 "github.com/apache/beam/sdks/v2/go/pkg/beam" 27 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/bigqueryio" 28 "github.com/apache/beam/sdks/v2/go/pkg/beam/log" 29 "github.com/apache/beam/sdks/v2/go/pkg/beam/options/gcpopts" 30 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 31 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 32 ) 33 34 var ( 35 input = flag.String("input", "publicdata:samples.shakespeare", "Shakespeare plays BQ table.") 36 output = flag.String("output", "", "Output BQ table.") 37 38 minLength = flag.Int("min_length", 9, "Minimum word length") 39 small_words = beam.NewCounter("extract", "small_words") 40 ) 41 42 func init() { 43 register.Function2x1(concatFn) 44 register.Function2x1(formatFn) 45 register.DoFn3x0[context.Context, WordRow, func(string, string)](&extractFn{}) 46 register.Emitter2[string, string]() 47 } 48 49 type WordRow struct { 50 Corpus string `bigquery:"corpus"` 51 Word string `bigquery:"word"` 52 } 53 54 type PlaysRow struct { 55 Word string `bigquery:"word"` 56 Plays string `bigquery:"plays"` 57 } 58 59 // PlaysForWords generates a string containing the list of play names 60 // in which that word appears. It takes a PCollection<WordRow> and 61 // returns a PCollection<PlaysRow>. 62 func PlaysForWords(s beam.Scope, rows beam.PCollection) beam.PCollection { 63 s = s.Scope("PlaysForWords") 64 65 words := beam.ParDo(s, &extractFn{MinLength: *minLength}, rows) 66 keyed := beam.CombinePerKey(s, concatFn, words) 67 return beam.ParDo(s, formatFn, keyed) 68 } 69 70 // extractFn outputs (word, play) iff the word is longer than the minimum length. 71 type extractFn struct { 72 MinLength int `json:"min_length"` 73 } 74 75 func (f *extractFn) ProcessElement(ctx context.Context, row WordRow, emit func(string, string)) { 76 if len(row.Word) >= f.MinLength { 77 emit(row.Word, row.Corpus) 78 } else { 79 small_words.Inc(ctx, 1) 80 } 81 } 82 83 // TODO(herohde) 7/14/2017: the choice of a string (instead of []string) for the 84 // output makes the combiner simpler. Seems hokey. 85 86 func concatFn(a, b string) string { 87 return fmt.Sprintf("%v,%v", a, b) 88 } 89 90 func formatFn(word, plays string) PlaysRow { 91 return PlaysRow{Word: word, Plays: plays} 92 } 93 94 func main() { 95 flag.Parse() 96 beam.Init() 97 98 ctx := context.Background() 99 100 if *output == "" { 101 log.Exit(ctx, "No output table specified. Use --output=<table>") 102 } 103 project := gcpopts.GetProject(ctx) 104 105 log.Info(ctx, "Running combine") 106 107 p := beam.NewPipeline() 108 s := p.Root() 109 rows := bigqueryio.Read(s, project, *input, reflect.TypeOf(WordRow{})) 110 out := PlaysForWords(s, rows) 111 bigqueryio.Write(s, project, *output, out) 112 113 if err := beamx.Run(ctx, p); err != nil { 114 log.Exitf(ctx, "Failed to execute job: %v", err) 115 } 116 }