github.com/apache/beam/sdks/v2@v2.48.2/go/examples/pingpong/pingpong.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 package main 17 18 import ( 19 "context" 20 "errors" 21 "flag" 22 "fmt" 23 "os" 24 "regexp" 25 26 "github.com/apache/beam/sdks/v2/go/pkg/beam" 27 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" 28 "github.com/apache/beam/sdks/v2/go/pkg/beam/log" 29 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 30 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 31 ) 32 33 var ( 34 input = flag.String("input", os.ExpandEnv("$GOPATH/src/github.com/apache/beam/sdks/go/data/haiku/old_pond.txt"), "Files to read.") 35 output = flag.String("output", "/tmp/pingpong/out.", "Prefix of output.") 36 ) 37 38 func init() { 39 register.Function4x1(multiFn) 40 register.Function3x1(subsetFn) 41 register.Function2x0(extractFn) 42 43 register.Emitter1[string]() 44 register.Iter1[string]() 45 } 46 47 // stitch constructs two composite PTransforms that provide input to each other. It 48 // is a (deliberately) complex DAG to show what kind of structures are possible. 49 func stitch(s beam.Scope, words beam.PCollection) (beam.PCollection, beam.PCollection) { 50 ping := s.Scope("ping") 51 pong := ping // s.Scope("pong") 52 53 // NOTE(herohde) 2/23/2017: Dataflow does not allow cyclic composite structures. 54 55 small1, big1 := beam.ParDo2(ping, multiFn, words, beam.SideInput{Input: words}) // self-sample (ping) 56 small2, big2 := beam.ParDo2(pong, multiFn, words, beam.SideInput{Input: big1}) // big-sample (pong). More words are small. 57 _, big3 := beam.ParDo2(ping, multiFn, big2, beam.SideInput{Input: small1}) // small-sample big (ping). All words are big. 58 small4, _ := beam.ParDo2(pong, multiFn, small2, beam.SideInput{Input: big3}) // big-sample small (pong). All words are small. 59 60 return small4, big3 61 } 62 63 // Slice side input. 64 65 func multiFn(word string, sample []string, small, big func(string)) error { 66 // TODO: side input processing into start bundle, once supported. 67 68 count := 0 69 size := 0 70 for _, w := range sample { 71 count++ 72 size += len(w) 73 } 74 if count == 0 { 75 return errors.New("empty sample") 76 } 77 avg := size / count 78 79 if len(word) < avg { 80 small(word) 81 } else { 82 big(word) 83 } 84 return nil 85 } 86 87 func subset(s beam.Scope, a, b beam.PCollection) { 88 beam.ParDo0(s, subsetFn, beam.Impulse(s), beam.SideInput{Input: a}, beam.SideInput{Input: b}) 89 } 90 91 func subsetFn(_ []byte, a, b func(*string) bool) error { 92 larger := make(map[string]bool) 93 var elm string 94 for b(&elm) { 95 larger[elm] = true 96 } 97 for a(&elm) { 98 if !larger[elm] { 99 return fmt.Errorf("extra element: %v", elm) 100 } 101 } 102 return nil 103 } 104 105 var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`) 106 107 func extractFn(line string, emit func(string)) { 108 for _, word := range wordRE.FindAllString(line, -1) { 109 emit(word) 110 } 111 } 112 113 func main() { 114 flag.Parse() 115 beam.Init() 116 117 ctx := context.Background() 118 119 log.Info(ctx, "Running pingpong") 120 121 // PingPong constructs a convoluted pipeline with two "cyclic" composites. 122 p := beam.NewPipeline() 123 s := p.Root() 124 125 lines := textio.Read(s, *input) 126 words := beam.ParDo(s, extractFn, lines) 127 128 // Run baseline and stitch; then compare them. 129 small, big := beam.ParDo2(s, multiFn, words, beam.SideInput{Input: words}) 130 small2, big2 := stitch(s, words) 131 132 subset(s, small, small2) 133 subset(s, big2, big) 134 135 textio.Write(s, *output+"small.txt", small2) 136 textio.Write(s, *output+"big.txt", big2) 137 138 if err := beamx.Run(ctx, p); err != nil { 139 log.Exitf(ctx, "Failed to execute job: %v", err) 140 } 141 }