github.com/apache/beam/sdks/v2@v2.48.2/go/examples/contains/contains.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  package main
    17  
    18  // beam-playground:
    19  //   name: Contains
    20  //   description: An example counts received substring in Shakespeare's works.
    21  //   multifile: false
    22  //   pipeline_options: --search king
    23  //   context_line: 51
    24  //   categories:
    25  //     - Filtering
    26  //     - Options
    27  //     - Debugging
    28  //   complexity: MEDIUM
    29  //   tags:
    30  //     - count
    31  //     - io
    32  //     - strings
    33  
    34  import (
    35  	"context"
    36  	"flag"
    37  	"fmt"
    38  	"regexp"
    39  	"strings"
    40  
    41  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    42  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
    43  	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
    44  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    45  	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"
    46  	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
    47  	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/debug"
    48  )
    49  
    50  // Options used purely at pipeline construction-time can just be flags.
    51  var (
    52  	input  = flag.String("input", "gs://apache-beam-samples/shakespeare/kinglear.txt", "File(s) to read.")
    53  	search = flag.String("search", "", "Only return words that contain this substring.")
    54  )
    55  
    56  func init() {
    57  	register.Function2x0(extractFn)
    58  	register.Function2x1(formatFn)
    59  	register.DoFn2x0[string, func(string)](&includeFn{})
    60  	register.Emitter1[string]()
    61  }
    62  
    63  // FilterWords returns PCollection<KV<word,count>> with (up to) 10 matching words.
    64  func FilterWords(s beam.Scope, lines beam.PCollection) beam.PCollection {
    65  	s = s.Scope("FilterWords")
    66  	words := beam.ParDo(s, extractFn, lines)
    67  	filtered := beam.ParDo(s, &includeFn{Search: *search}, words)
    68  	counted := stats.Count(s, filtered)
    69  	return debug.Head(s, counted, 10)
    70  }
    71  
    72  var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`)
    73  
    74  func extractFn(line string, emit func(string)) {
    75  	for _, w := range wordRE.FindAllString(line, -1) {
    76  		emit(w)
    77  	}
    78  }
    79  
    80  // includeFn outputs (word) iif the word contains substring Search.
    81  type includeFn struct {
    82  	Search string `json:"search"`
    83  }
    84  
    85  func (f *includeFn) ProcessElement(s string, emit func(string)) {
    86  	if strings.Contains(s, f.Search) {
    87  		emit(s)
    88  	}
    89  }
    90  
    91  func formatFn(w string, c int) string {
    92  	return fmt.Sprintf("%s: %v", w, c)
    93  }
    94  
    95  func main() {
    96  	flag.Parse()
    97  	beam.Init()
    98  
    99  	ctx := context.Background()
   100  
   101  	if *search == "" {
   102  		log.Exit(ctx, "No search string provided. Use --search=foo")
   103  	}
   104  
   105  	log.Info(ctx, "Running contains")
   106  
   107  	// Construct a pipeline that only keeps 10 words that contain the provided search string.
   108  	p := beam.NewPipeline()
   109  	s := p.Root()
   110  	lines := textio.Read(s, *input)
   111  	filtered := FilterWords(s, lines)
   112  	formatted := beam.ParDo(s, formatFn, filtered)
   113  	debug.Print(s, formatted)
   114  
   115  	if err := beamx.Run(ctx, p); err != nil {
   116  		log.Exitf(ctx, "Failed to execute job: %v", err)
   117  	}
   118  }