github.com/apache/beam/sdks/v2@v2.48.2/go/examples/pingpong/pingpong.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  package main
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"flag"
    22  	"fmt"
    23  	"os"
    24  	"regexp"
    25  
    26  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    27  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
    28  	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
    29  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    30  	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
    31  )
    32  
    33  var (
    34  	input  = flag.String("input", os.ExpandEnv("$GOPATH/src/github.com/apache/beam/sdks/go/data/haiku/old_pond.txt"), "Files to read.")
    35  	output = flag.String("output", "/tmp/pingpong/out.", "Prefix of output.")
    36  )
    37  
    38  func init() {
    39  	register.Function4x1(multiFn)
    40  	register.Function3x1(subsetFn)
    41  	register.Function2x0(extractFn)
    42  
    43  	register.Emitter1[string]()
    44  	register.Iter1[string]()
    45  }
    46  
    47  // stitch constructs two composite PTransforms that provide input to each other. It
    48  // is a (deliberately) complex DAG to show what kind of structures are possible.
    49  func stitch(s beam.Scope, words beam.PCollection) (beam.PCollection, beam.PCollection) {
    50  	ping := s.Scope("ping")
    51  	pong := ping // s.Scope("pong")
    52  
    53  	// NOTE(herohde) 2/23/2017: Dataflow does not allow cyclic composite structures.
    54  
    55  	small1, big1 := beam.ParDo2(ping, multiFn, words, beam.SideInput{Input: words}) // self-sample (ping)
    56  	small2, big2 := beam.ParDo2(pong, multiFn, words, beam.SideInput{Input: big1})  // big-sample  (pong). More words are small.
    57  	_, big3 := beam.ParDo2(ping, multiFn, big2, beam.SideInput{Input: small1})      // small-sample big (ping). All words are big.
    58  	small4, _ := beam.ParDo2(pong, multiFn, small2, beam.SideInput{Input: big3})    // big-sample small (pong). All words are small.
    59  
    60  	return small4, big3
    61  }
    62  
    63  // Slice side input.
    64  
    65  func multiFn(word string, sample []string, small, big func(string)) error {
    66  	// TODO: side input processing into start bundle, once supported.
    67  
    68  	count := 0
    69  	size := 0
    70  	for _, w := range sample {
    71  		count++
    72  		size += len(w)
    73  	}
    74  	if count == 0 {
    75  		return errors.New("empty sample")
    76  	}
    77  	avg := size / count
    78  
    79  	if len(word) < avg {
    80  		small(word)
    81  	} else {
    82  		big(word)
    83  	}
    84  	return nil
    85  }
    86  
    87  func subset(s beam.Scope, a, b beam.PCollection) {
    88  	beam.ParDo0(s, subsetFn, beam.Impulse(s), beam.SideInput{Input: a}, beam.SideInput{Input: b})
    89  }
    90  
    91  func subsetFn(_ []byte, a, b func(*string) bool) error {
    92  	larger := make(map[string]bool)
    93  	var elm string
    94  	for b(&elm) {
    95  		larger[elm] = true
    96  	}
    97  	for a(&elm) {
    98  		if !larger[elm] {
    99  			return fmt.Errorf("extra element: %v", elm)
   100  		}
   101  	}
   102  	return nil
   103  }
   104  
   105  var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`)
   106  
   107  func extractFn(line string, emit func(string)) {
   108  	for _, word := range wordRE.FindAllString(line, -1) {
   109  		emit(word)
   110  	}
   111  }
   112  
   113  func main() {
   114  	flag.Parse()
   115  	beam.Init()
   116  
   117  	ctx := context.Background()
   118  
   119  	log.Info(ctx, "Running pingpong")
   120  
   121  	// PingPong constructs a convoluted pipeline with two "cyclic" composites.
   122  	p := beam.NewPipeline()
   123  	s := p.Root()
   124  
   125  	lines := textio.Read(s, *input)
   126  	words := beam.ParDo(s, extractFn, lines)
   127  
   128  	// Run baseline and stitch; then compare them.
   129  	small, big := beam.ParDo2(s, multiFn, words, beam.SideInput{Input: words})
   130  	small2, big2 := stitch(s, words)
   131  
   132  	subset(s, small, small2)
   133  	subset(s, big2, big)
   134  
   135  	textio.Write(s, *output+"small.txt", small2)
   136  	textio.Write(s, *output+"big.txt", big2)
   137  
   138  	if err := beamx.Run(ctx, p); err != nil {
   139  		log.Exitf(ctx, "Failed to execute job: %v", err)
   140  	}
   141  }