github.com/apache/beam/sdks/v2@v2.48.2/go/examples/wordcount/wordcount.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  // wordcount is an example that counts words in Shakespeare and demonstrates
    17  // Beam best practices.
    18  //
    19  // This example is the second in a series of four successively more detailed
    20  // 'word count' examples. You may first want to take a look at
    21  // minimal_wordcount. After you've looked at this example, see the
    22  // debugging_wordcount pipeline for introduction of additional concepts.
    23  //
    24  // For a detailed walkthrough of this example, see
    25  //
    26  //	https://beam.apache.org/get-started/wordcount-example/
    27  //
    28  // Basic concepts, also in the minimal_wordcount example: reading text files;
    29  // counting a PCollection; writing to text files.
    30  //
    31  // New concepts:
    32  //
    33  //  1. Executing a pipeline both locally and using the selected runner
    34  //  2. Defining your own pipeline options
    35  //  3. Using ParDo with static DoFns defined out-of-line
    36  //  4. Building a composite transform
    37  //
    38  // Concept #1: You can execute this pipeline either locally or by
    39  // selecting another runner. These are now command-line options added by
    40  // the 'beamx' package and not hard-coded as they were in the minimal_wordcount
    41  // example. The 'beamx' package also registers all included runners and
    42  // filesystems as a convenience.
    43  //
    44  // To change the runner, specify:
    45  //
    46  //	--runner=YOUR_SELECTED_RUNNER
    47  //
    48  // To execute this pipeline, specify a local output file (if using the
    49  // 'direct' runner) or a remote file on a supported distributed file system.
    50  //
    51  //	--output=[YOUR_LOCAL_FILE | YOUR_REMOTE_FILE]
    52  //
    53  // The input file defaults to a public data set containing the text of King
    54  // Lear by William Shakespeare. You can override it and choose your own input
    55  // with --input.
    56  package main
    57  
    58  // beam-playground:
    59  //   name: WordCount
    60  //   description: An example that counts words in Shakespeare's works.
    61  //   multifile: false
    62  //   pipeline_options: --output output.txt
    63  //   context_line: 120
    64  //   categories:
    65  //     - Combiners
    66  //     - Options
    67  //     - Quickstart
    68  //   complexity: MEDIUM
    69  //   tags:
    70  //     - count
    71  //     - io
    72  //     - strings
    73  
    74  import (
    75  	"context"
    76  	"flag"
    77  	"fmt"
    78  	"log"
    79  	"regexp"
    80  	"strings"
    81  
    82  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    83  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
    84  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    85  	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"
    86  	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
    87  )
    88  
    89  // Concept #2: Defining your own configuration options. Pipeline options can
    90  // be standard Go flags, or they can be obtained any other way. Defining and
    91  // configuring the pipeline is normal Go code.
    92  var (
    93  	// By default, this example reads from a public dataset containing the text of
    94  	// King Lear. Set this option to choose a different input file or glob.
    95  	input = flag.String("input", "gs://apache-beam-samples/shakespeare/kinglear.txt", "File(s) to read.")
    96  
    97  	// Set this required option to specify where to write the output.
    98  	output = flag.String("output", "", "Output file (required).")
    99  )
   100  
   101  // Concept #3: You can make your pipeline assembly code less verbose by
   102  // defining your DoFns statically out-of-line. A DoFn can be defined as a Go
   103  // function and is conventionally suffixed "Fn". Using named function
   104  // transforms allows for easy reuse, modular testing, and an improved monitoring
   105  // experience. The argument and return types of a function dictate the pipeline
   106  // shape when used in a ParDo. For example,
   107  //
   108  //	func formatFn(w string, c int) string
   109  //
   110  // indicates that the function operates on a PCollection of type KV<string,int>,
   111  // representing key value pairs of strings and ints, and outputs a PCollection
   112  // of type string. Beam typechecks the pipeline before running it.
   113  //
   114  // DoFns that potentially output zero or multiple elements can also be Go
   115  // functions, but have a different signature. For example,
   116  //
   117  //	func extractFn(w string, emit func(string))
   118  //
   119  // uses an "emit" function argument instead of a string return type to allow it
   120  // to output any number of elements. It operates on a PCollection of type string
   121  // and returns a PCollection of type string.
   122  //
// DoFns must be registered with Beam in order to be executed in ParDos. This
// can be done automatically by the starcgen code generator, or manually in an
// init() function, either by calling beam.RegisterFunction or, as below, by
// using the generic helpers in the register package.
   126  func init() {
   127  	// register.DoFnXxY registers a struct DoFn so that it can be correctly
   128  	// serialized and does some optimization to avoid runtime reflection. Since
   129  	// extractFn has 3 inputs and 0 outputs, we use register.DoFn3x0 and provide
   130  	// its input types as its constraints (if it had any outputs, we would add
   131  	// those as constraints as well). Struct DoFns must be registered for a
   132  	// pipeline to run.
   133  	register.DoFn3x0[context.Context, string, func(string)](&extractFn{})
   134  	// register.FunctionXxY registers a functional DoFn to optimize execution at
   135  	// runtime. formatFn has 2 inputs and 1 output, so we use
   136  	// register.Function2x1.
   137  	register.Function2x1(formatFn)
   138  	// register.EmitterX is optional and will provide some optimization to make
   139  	// things run faster. Any emitters (functions that produce output for the next
   140  	// step) should be registered. Here we register all emitters with the
   141  	// signature func(string).
   142  	register.Emitter1[string]()
   143  }
   144  
   145  var (
   146  	wordRE          = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`)
   147  	empty           = beam.NewCounter("extract", "emptyLines")
   148  	smallWordLength = flag.Int("small_word_length", 9, "length of small words (default: 9)")
   149  	smallWords      = beam.NewCounter("extract", "smallWords")
   150  	lineLen         = beam.NewDistribution("extract", "lineLenDistro")
   151  )
   152  
   153  // extractFn is a structural DoFn that emits the words in a given line and keeps
   154  // a count for small words. Its ProcessElement function will be invoked on each
   155  // element in the input PCollection.
   156  type extractFn struct {
   157  	SmallWordLength int `json:"smallWordLength"`
   158  }
   159  
   160  func (f *extractFn) ProcessElement(ctx context.Context, line string, emit func(string)) {
   161  	lineLen.Update(ctx, int64(len(line)))
   162  	if len(strings.TrimSpace(line)) == 0 {
   163  		empty.Inc(ctx, 1)
   164  	}
   165  	for _, word := range wordRE.FindAllString(line, -1) {
   166  		// increment the counter for small words if length of words is
   167  		// less than small_word_length
   168  		if len(word) < f.SmallWordLength {
   169  			smallWords.Inc(ctx, 1)
   170  		}
   171  		emit(word)
   172  	}
   173  }
   174  
   175  // formatFn is a functional DoFn that formats a word and its count as a string.
   176  func formatFn(w string, c int) string {
   177  	return fmt.Sprintf("%s: %v", w, c)
   178  }
   179  
   180  // Concept #4: A composite PTransform is a Go function that adds
   181  // transformations to a given pipeline. It is run at construction time and
   182  // works on PCollections as values. For monitoring purposes, the pipeline
   183  // allows scoped naming for composite transforms. The difference between a
   184  // composite transform and a construction helper function is solely in whether
   185  // a scoped name is used.
   186  //
   187  // For example, the CountWords function is a custom composite transform that
   188  // bundles two transforms (ParDo and Count) as a reusable function.
   189  
   190  // CountWords is a composite transform that counts the words of a PCollection
   191  // of lines. It expects a PCollection of type string and returns a PCollection
   192  // of type KV<string,int>. The Beam type checker enforces these constraints
   193  // during pipeline construction.
   194  func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection {
   195  	s = s.Scope("CountWords")
   196  
   197  	// Convert lines of text into individual words.
   198  	col := beam.ParDo(s, &extractFn{SmallWordLength: *smallWordLength}, lines)
   199  
   200  	// Count the number of times each word occurs.
   201  	return stats.Count(s, col)
   202  }
   203  
   204  func main() {
   205  	// If beamx or Go flags are used, flags must be parsed first.
   206  	flag.Parse()
   207  	// beam.Init() is an initialization hook that must be called on startup. On
   208  	// distributed runners, it is used to intercept control.
   209  	beam.Init()
   210  
   211  	// Input validation is done as usual. Note that it must be after Init().
   212  	if *output == "" {
   213  		log.Fatal("No output provided")
   214  	}
   215  
   216  	// Concepts #3 and #4: The pipeline uses the named transform and DoFn.
   217  	p := beam.NewPipeline()
   218  	s := p.Root()
   219  
   220  	lines := textio.Read(s, *input)
   221  	counted := CountWords(s, lines)
   222  	formatted := beam.ParDo(s, formatFn, counted)
   223  	textio.Write(s, *output, formatted)
   224  
   225  	// Concept #1: The beamx.Run convenience wrapper allows a number of
   226  	// pre-defined runners to be used via the --runner flag.
   227  	if err := beamx.Run(context.Background(), p); err != nil {
   228  		log.Fatalf("Failed to execute job: %v", err)
   229  	}
   230  }