github.com/apache/beam/sdks/v2@v2.48.2/go/examples/debugging_wordcount/debugging_wordcount.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  // debugging_wordcount is an example that verifies word counts in Shakespeare
    17  // and includes Beam best practices.
    18  //
    19  // This example, debugging_wordcount, is the third in a series of four
    20  // successively more detailed 'word count' examples. You may first want to
    21  // take a look at minimal_wordcount and wordcount. After you've looked at
    22  // this example, then see the windowed_wordcount pipeline, for introduction
    23  // of additional concepts.
    24  //
    25  // Basic concepts, also in the minimal_wordcount and wordcount examples:
    26  // Reading text files; counting a PCollection; executing a Pipeline both locally
    27  // and using a selected runner; defining DoFns.
    28  //
    29  // New Concepts:
    30  //
    31  //  1. Using the richer struct DoFn form and accessing optional arguments.
    32  //  2. Logging using the Beam log package, even in a distributed environment
    33  //  3. Testing your Pipeline via passert
    34  //
    35  // To change the runner, specify:
    36  //
    37  //	--runner=YOUR_SELECTED_RUNNER
    38  //
    39  // The input file defaults to a public data set containing the text of King
    40  // Lear, by William Shakespeare. You can override it and choose your own input
    41  // with --input.
    42  package main
    43  
    44  // beam-playground:
    45  //   name: DebuggingWordCount
    46  //   description: An example that counts words in Shakespeare's works includes regex filter("Flourish|stomach").
    47  //   multifile: false
    48  //   pipeline_options: --output output.txt
    49  //   context_line: 83
    50  //   categories:
    51  //     - Options
    52  //     - Filtering
    53  //     - Debugging
    54  //     - Quickstart
    55  //   complexity: MEDIUM
    56  //   tags:
    57  //     - count
    58  //     - io
    59  //     - strings
    60  
    61  import (
    62  	"context"
    63  	"flag"
    64  	"fmt"
    65  	"regexp"
    66  
    67  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    68  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
    69  	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
    70  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    71  	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert"
    72  	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"
    73  	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
    74  )
    75  
    76  // TODO(herohde) 10/16/2017: support metrics and log level cutoff.
    77  
    78  var (
    79  	input  = flag.String("input", "gs://apache-beam-samples/shakespeare/kinglear.txt", "File(s) to read.")
    80  	filter = flag.String("filter", "Flourish|stomach", "Regex filter pattern to use. Only words matching this pattern will be included.")
    81  	output = flag.String("output", "", "Output file (required).")
    82  )
    83  
    84  // Concept #1: a DoFn can also be a struct with methods for setup/teardown and
    85  // element/bundle processing. It also allows configuration values to be made
    86  // available at runtime.
    87  
    88  func init() {
    89  	// register.DoFnXxY registers a struct DoFn so that it can be correctly serialized and does some optimization
    90  	// to avoid runtime reflection. Since addTimestampFn has 4 inputs and 0 outputs, we use register.DoFn4x0 and provide
    91  	// its input/output types as its constraints.
    92  	// Struct DoFns must be registered for a pipeline to run.
    93  	register.DoFn4x0[context.Context, string, int, func(string, int)](&filterFn{})
    94  	// For simple functional (non-struct) DoFns we can use register.FunctionXxY to perform the same registration without
    95  	// providing type constraints.
    96  	register.Function2x0(extractFn)
    97  	register.Function2x1(formatFn)
    98  	// register.EmitterX is optional and will provide some optimization to make things run faster. Any emitters
    99  	// (functions that produce output for the next step) should be registered. Here we register all emitters with
   100  	// the signature func(string, int).
   101  	register.Emitter2[string, int]()
   102  }
   103  
   104  // filterFn is a DoFn for filtering out certain words.
   105  type filterFn struct {
   106  	// Filter is a regex that is serialized as json and available at runtime.
   107  	// Such fields must be exported.
   108  	Filter string `json:"filter"`
   109  
   110  	re *regexp.Regexp
   111  }
   112  
   113  func (f *filterFn) Setup() {
   114  	f.re = regexp.MustCompile(f.Filter)
   115  }
   116  
   117  // Concept #2: The Beam log package should be used for all logging in runtime
   118  // functions. The needed context is made available as an argument.
   119  
   120  func (f *filterFn) ProcessElement(ctx context.Context, word string, count int, emit func(string, int)) {
   121  	if f.re.MatchString(word) {
   122  		// Log at the "INFO" level each element that we match.
   123  		log.Infof(ctx, "Matched: %v", word)
   124  		emit(word, count)
   125  	} else {
   126  		// Log at the "DEBUG" level each element that is not matched.
   127  		log.Debugf(ctx, "Did not match: %v", word)
   128  	}
   129  }
   130  
   131  // The below transforms are identical to the wordcount versions. If this was
   132  // production code, common transforms would be placed in a separate package
   133  // and shared directly rather than being copied.
   134  
   135  var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`)
   136  
   137  // extractFn is a DoFn that emits the words in a given line.
   138  func extractFn(line string, emit func(string)) {
   139  	for _, word := range wordRE.FindAllString(line, -1) {
   140  		emit(word)
   141  	}
   142  }
   143  
   144  // formatFn is a DoFn that formats a word and its count as a string.
   145  func formatFn(w string, c int) string {
   146  	return fmt.Sprintf("%s: %v", w, c)
   147  }
   148  
   149  // CountWords is a composite transform that counts the words of an PCollection
   150  // of lines. It expects a PCollection of type string and returns a PCollection
   151  // of type KV<string,int>.
   152  func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection {
   153  	s = s.Scope("CountWords")
   154  	col := beam.ParDo(s, extractFn, lines)
   155  	return stats.Count(s, col)
   156  }
   157  
   158  func main() {
   159  	flag.Parse()
   160  	beam.Init()
   161  
   162  	// Concept #2: the beam logging package works both during pipeline
   163  	// construction and at runtime. It should always be used.
   164  	ctx := context.Background()
   165  	if *output == "" {
   166  		log.Exit(ctx, "No output provided")
   167  	}
   168  	if _, err := regexp.Compile(*filter); err != nil {
   169  		log.Exitf(ctx, "Invalid filter: %v", err)
   170  	}
   171  
   172  	p := beam.NewPipeline()
   173  	s := p.Root()
   174  
   175  	lines := textio.Read(s, *input)
   176  	counted := CountWords(s, lines)
   177  	filtered := beam.ParDo(s, &filterFn{Filter: *filter}, counted)
   178  	formatted := beam.ParDo(s, formatFn, filtered)
   179  
   180  	// Concept #3: passert is a set of convenient PTransforms that can be used
   181  	// when writing Pipeline level tests to validate the contents of
   182  	// PCollections. passert is best used in unit tests with small data sets
   183  	// but is demonstrated here as a teaching tool.
   184  
   185  	passert.Equals(s, formatted, "Flourish: 3", "stomach: 1")
   186  
   187  	if err := beamx.Run(ctx, p); err != nil {
   188  		log.Exitf(ctx, "Failed to execute job: %v", err)
   189  	}
   190  }