// github.com/apache/beam/sdks/v2@v2.48.2/go/examples/minimal_wordcount/minimal_wordcount.go (about)

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// minimal_wordcount is an example that counts words in King Lear,
// by William Shakespeare.
//
// This example is the first in a series of four successively more detailed
// 'word count' examples. Here, for simplicity, we don't show any
// error-checking or argument processing, and focus on construction of the
// pipeline, which chains together the application of core transforms.
//
// Next, see the wordcount pipeline, then the debugging_wordcount pipeline, and
// finally the windowed_wordcount pipeline, for more detailed examples that
// introduce additional concepts.
//
// Concepts:
//
//  1. Reading data from text files
//  2. Specifying 'inline' transforms
//  3. Counting items in a PCollection
//  4. Writing data to text files
//
// No arguments are required to run this pipeline. It will be executed with
// the direct runner. You can see the results in the output file named
// "wordcounts.txt" in your current working directory.
38 package main 39 40 // beam-playground: 41 // name: MinimalWordCount 42 // description: An example that counts words in King Lear, 43 // by William Shakespeare. 44 // multifile: false 45 // default_example: true 46 // context_line: 69 47 // categories: 48 // - IO 49 // - Combiners 50 // - Core Transforms 51 // - Quickstart 52 // complexity: BASIC 53 // tags: 54 // - count 55 // - io 56 // - strings 57 58 import ( 59 "context" 60 "fmt" 61 "regexp" 62 63 "github.com/apache/beam/sdks/v2/go/pkg/beam" 64 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" 65 "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct" 66 "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats" 67 68 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/gcs" 69 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/local" 70 ) 71 72 var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`) 73 74 func main() { 75 // beam.Init() is an initialization hook that must be called on startup. 76 beam.Init() 77 78 // Create the Pipeline object and root scope. 79 p := beam.NewPipeline() 80 s := p.Root() 81 82 // Apply the pipeline's transforms. 83 84 // Concept #1: Invoke a root transform with the pipeline; in this case, 85 // textio.Read to read a set of input text file. textio.Read returns a 86 // PCollection where each element is one line from the input text 87 // (one of Shakespeare's texts). 88 89 // This example reads from a public dataset containing the text 90 // of King Lear. 91 lines := textio.Read(s, "gs://apache-beam-samples/shakespeare/kinglear.txt") 92 93 // Concept #2: Invoke a ParDo transform on our PCollection of text lines. 94 // This ParDo invokes a DoFn (defined in-line) on each element that 95 // tokenizes the text line into individual words. The ParDo returns a 96 // PCollection of type string, where each element is an individual word in 97 // Shakespeare's collected texts. 
98 words := beam.ParDo(s, func(line string, emit func(string)) { 99 for _, word := range wordRE.FindAllString(line, -1) { 100 emit(word) 101 } 102 }, lines) 103 104 // Concept #3: Invoke the stats.Count transform on our PCollection of 105 // individual words. The Count transform returns a new PCollection of 106 // key/value pairs, where each key represents a unique word in the text. 107 // The associated value is the occurrence count for that word. 108 counted := stats.Count(s, words) 109 110 // Use a ParDo to format our PCollection of word counts into a printable 111 // string, suitable for writing to an output file. When each element 112 // produces exactly one element, the DoFn can simply return it. 113 formatted := beam.ParDo(s, func(w string, c int) string { 114 return fmt.Sprintf("%s: %v", w, c) 115 }, counted) 116 117 // Concept #4: Invoke textio.Write at the end of the pipeline to write 118 // the contents of a PCollection (in this case, our PCollection of 119 // formatted strings) to a text file. 120 textio.Write(s, "wordcounts.txt", formatted) 121 122 // Run the pipeline on the direct runner. 123 direct.Execute(context.Background(), p) 124 }