github.com/apache/beam/sdks/v2@v2.48.2/go/examples/minimal_wordcount/minimal_wordcount.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  // minimal_wordcount is an example that counts words in King Lear,
    17  // by William Shakespeare.
    18  //
    19  // This example is the first in a series of four successively more detailed
    20  // 'word count' examples. Here, for simplicity, we don't show any
    21  // error-checking or argument processing, and focus on construction of the
    22  // pipeline, which chains together the application of core transforms.
    23  //
    24  // Next, see the wordcount pipeline, then the debugging_wordcount pipeline, and
    25  // finally the windowed_wordcount pipeline, for more detailed examples that
    26  // introduce additional concepts.
    27  //
    28  // Concepts:
    29  //
    30  //  1. Reading data from text files
    31  //  2. Specifying 'inline' transforms
    32  //  3. Counting items in a PCollection
    33  //  4. Writing data to text files
    34  //
    35  // No arguments are required to run this pipeline. It will be executed with
    36  // the direct runner. You can see the results in the output file named
    37  // "wordcounts.txt" in your current working directory.
    38  package main
    39  
    40  // beam-playground:
    41  //   name: MinimalWordCount
    42  //   description: An example that counts words in King Lear,
    43  //     by William Shakespeare.
    44  //   multifile: false
    45  //   default_example: true
    46  //   context_line: 69
    47  //   categories:
    48  //     - IO
    49  //     - Combiners
    50  //     - Core Transforms
    51  //     - Quickstart
    52  //   complexity: BASIC
    53  //   tags:
    54  //     - count
    55  //     - io
    56  //     - strings
    57  
import (
	"context"
	"fmt"
	"log"
	"regexp"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"

	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/gcs"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/local"
)
    71  
    72  var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`)
    73  
    74  func main() {
    75  	// beam.Init() is an initialization hook that must be called on startup.
    76  	beam.Init()
    77  
    78  	// Create the Pipeline object and root scope.
    79  	p := beam.NewPipeline()
    80  	s := p.Root()
    81  
    82  	// Apply the pipeline's transforms.
    83  
    84  	// Concept #1: Invoke a root transform with the pipeline; in this case,
    85  	// textio.Read to read a set of input text file. textio.Read returns a
    86  	// PCollection where each element is one line from the input text
    87  	// (one of Shakespeare's texts).
    88  
    89  	// This example reads from a public dataset containing the text
    90  	// of King Lear.
    91  	lines := textio.Read(s, "gs://apache-beam-samples/shakespeare/kinglear.txt")
    92  
    93  	// Concept #2: Invoke a ParDo transform on our PCollection of text lines.
    94  	// This ParDo invokes a DoFn (defined in-line) on each element that
    95  	// tokenizes the text line into individual words. The ParDo returns a
    96  	// PCollection of type string, where each element is an individual word in
    97  	// Shakespeare's collected texts.
    98  	words := beam.ParDo(s, func(line string, emit func(string)) {
    99  		for _, word := range wordRE.FindAllString(line, -1) {
   100  			emit(word)
   101  		}
   102  	}, lines)
   103  
   104  	// Concept #3: Invoke the stats.Count transform on our PCollection of
   105  	// individual words. The Count transform returns a new PCollection of
   106  	// key/value pairs, where each key represents a unique word in the text.
   107  	// The associated value is the occurrence count for that word.
   108  	counted := stats.Count(s, words)
   109  
   110  	// Use a ParDo to format our PCollection of word counts into a printable
   111  	// string, suitable for writing to an output file. When each element
   112  	// produces exactly one element, the DoFn can simply return it.
   113  	formatted := beam.ParDo(s, func(w string, c int) string {
   114  		return fmt.Sprintf("%s: %v", w, c)
   115  	}, counted)
   116  
   117  	// Concept #4: Invoke textio.Write at the end of the pipeline to write
   118  	// the contents of a PCollection (in this case, our PCollection of
   119  	// formatted strings) to a text file.
   120  	textio.Write(s, "wordcounts.txt", formatted)
   121  
   122  	// Run the pipeline on the direct runner.
   123  	direct.Execute(context.Background(), p)
   124  }