github.com/apache/beam/sdks/v2@v2.48.2/go/examples/wordcount/wordcount.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // wordcount is an example that counts words in Shakespeare and demonstrates 17 // Beam best practices. 18 // 19 // This example is the second in a series of four successively more detailed 20 // 'word count' examples. You may first want to take a look at 21 // minimal_wordcount. After you've looked at this example, see the 22 // debugging_wordcount pipeline for introduction of additional concepts. 23 // 24 // For a detailed walkthrough of this example, see 25 // 26 // https://beam.apache.org/get-started/wordcount-example/ 27 // 28 // Basic concepts, also in the minimal_wordcount example: reading text files; 29 // counting a PCollection; writing to text files. 30 // 31 // New concepts: 32 // 33 // 1. Executing a pipeline both locally and using the selected runner 34 // 2. Defining your own pipeline options 35 // 3. Using ParDo with static DoFns defined out-of-line 36 // 4. Building a composite transform 37 // 38 // Concept #1: You can execute this pipeline either locally or by 39 // selecting another runner. These are now command-line options added by 40 // the 'beamx' package and not hard-coded as they were in the minimal_wordcount 41 // example. The 'beamx' package also registers all included runners and 42 // filesystems as a convenience. 43 // 44 // To change the runner, specify: 45 // 46 // --runner=YOUR_SELECTED_RUNNER 47 // 48 // To execute this pipeline, specify a local output file (if using the 49 // 'direct' runner) or a remote file on a supported distributed file system. 50 // 51 // --output=[YOUR_LOCAL_FILE | YOUR_REMOTE_FILE] 52 // 53 // The input file defaults to a public data set containing the text of King 54 // Lear by William Shakespeare. You can override it and choose your own input 55 // with --input. 56 package main 57 58 // beam-playground: 59 // name: WordCount 60 // description: An example that counts words in Shakespeare's works. 61 // multifile: false 62 // pipeline_options: --output output.txt 63 // context_line: 120 64 // categories: 65 // - Combiners 66 // - Options 67 // - Quickstart 68 // complexity: MEDIUM 69 // tags: 70 // - count 71 // - io 72 // - strings 73 74 import ( 75 "context" 76 "flag" 77 "fmt" 78 "log" 79 "regexp" 80 "strings" 81 82 "github.com/apache/beam/sdks/v2/go/pkg/beam" 83 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" 84 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 85 "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats" 86 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 87 ) 88 89 // Concept #2: Defining your own configuration options. Pipeline options can 90 // be standard Go flags, or they can be obtained any other way. Defining and 91 // configuring the pipeline is normal Go code. 92 var ( 93 // By default, this example reads from a public dataset containing the text of 94 // King Lear. Set this option to choose a different input file or glob. 95 input = flag.String("input", "gs://apache-beam-samples/shakespeare/kinglear.txt", "File(s) to read.") 96 97 // Set this required option to specify where to write the output. 98 output = flag.String("output", "", "Output file (required).") 99 ) 100 101 // Concept #3: You can make your pipeline assembly code less verbose by 102 // defining your DoFns statically out-of-line. A DoFn can be defined as a Go 103 // function and is conventionally suffixed "Fn". Using named function 104 // transforms allows for easy reuse, modular testing, and an improved monitoring 105 // experience. The argument and return types of a function dictate the pipeline 106 // shape when used in a ParDo. For example, 107 // 108 // func formatFn(w string, c int) string 109 // 110 // indicates that the function operates on a PCollection of type KV<string,int>, 111 // representing key value pairs of strings and ints, and outputs a PCollection 112 // of type string. Beam typechecks the pipeline before running it. 113 // 114 // DoFns that potentially output zero or multiple elements can also be Go 115 // functions, but have a different signature. For example, 116 // 117 // func extractFn(w string, emit func(string)) 118 // 119 // uses an "emit" function argument instead of a string return type to allow it 120 // to output any number of elements. It operates on a PCollection of type string 121 // and returns a PCollection of type string. 122 // 123 // DoFns must be registered with Beam in order to be executed in ParDos. This is 124 // done automatically by the starcgen code generator, or it can be done manually 125 // by calling beam.RegisterFunction in an init() call. 126 func init() { 127 // register.DoFnXxY registers a struct DoFn so that it can be correctly 128 // serialized and does some optimization to avoid runtime reflection. Since 129 // extractFn has 3 inputs and 0 outputs, we use register.DoFn3x0 and provide 130 // its input types as its constraints (if it had any outputs, we would add 131 // those as constraints as well). Struct DoFns must be registered for a 132 // pipeline to run. 133 register.DoFn3x0[context.Context, string, func(string)](&extractFn{}) 134 // register.FunctionXxY registers a functional DoFn to optimize execution at 135 // runtime. formatFn has 2 inputs and 1 output, so we use 136 // register.Function2x1. 137 register.Function2x1(formatFn) 138 // register.EmitterX is optional and will provide some optimization to make 139 // things run faster. Any emitters (functions that produce output for the next 140 // step) should be registered. Here we register all emitters with the 141 // signature func(string). 142 register.Emitter1[string]() 143 } 144 145 var ( 146 wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`) 147 empty = beam.NewCounter("extract", "emptyLines") 148 smallWordLength = flag.Int("small_word_length", 9, "length of small words (default: 9)") 149 smallWords = beam.NewCounter("extract", "smallWords") 150 lineLen = beam.NewDistribution("extract", "lineLenDistro") 151 ) 152 153 // extractFn is a structural DoFn that emits the words in a given line and keeps 154 // a count for small words. Its ProcessElement function will be invoked on each 155 // element in the input PCollection. 156 type extractFn struct { 157 SmallWordLength int `json:"smallWordLength"` 158 } 159 160 func (f *extractFn) ProcessElement(ctx context.Context, line string, emit func(string)) { 161 lineLen.Update(ctx, int64(len(line))) 162 if len(strings.TrimSpace(line)) == 0 { 163 empty.Inc(ctx, 1) 164 } 165 for _, word := range wordRE.FindAllString(line, -1) { 166 // increment the counter for small words if length of words is 167 // less than small_word_length 168 if len(word) < f.SmallWordLength { 169 smallWords.Inc(ctx, 1) 170 } 171 emit(word) 172 } 173 } 174 175 // formatFn is a functional DoFn that formats a word and its count as a string. 176 func formatFn(w string, c int) string { 177 return fmt.Sprintf("%s: %v", w, c) 178 } 179 180 // Concept #4: A composite PTransform is a Go function that adds 181 // transformations to a given pipeline. It is run at construction time and 182 // works on PCollections as values. For monitoring purposes, the pipeline 183 // allows scoped naming for composite transforms. The difference between a 184 // composite transform and a construction helper function is solely in whether 185 // a scoped name is used. 186 // 187 // For example, the CountWords function is a custom composite transform that 188 // bundles two transforms (ParDo and Count) as a reusable function. 189 190 // CountWords is a composite transform that counts the words of a PCollection 191 // of lines. It expects a PCollection of type string and returns a PCollection 192 // of type KV<string,int>. The Beam type checker enforces these constraints 193 // during pipeline construction. 194 func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection { 195 s = s.Scope("CountWords") 196 197 // Convert lines of text into individual words. 198 col := beam.ParDo(s, &extractFn{SmallWordLength: *smallWordLength}, lines) 199 200 // Count the number of times each word occurs. 201 return stats.Count(s, col) 202 } 203 204 func main() { 205 // If beamx or Go flags are used, flags must be parsed first. 206 flag.Parse() 207 // beam.Init() is an initialization hook that must be called on startup. On 208 // distributed runners, it is used to intercept control. 209 beam.Init() 210 211 // Input validation is done as usual. Note that it must be after Init(). 212 if *output == "" { 213 log.Fatal("No output provided") 214 } 215 216 // Concepts #3 and #4: The pipeline uses the named transform and DoFn. 217 p := beam.NewPipeline() 218 s := p.Root() 219 220 lines := textio.Read(s, *input) 221 counted := CountWords(s, lines) 222 formatted := beam.ParDo(s, formatFn, counted) 223 textio.Write(s, *output, formatted) 224 225 // Concept #1: The beamx.Run convenience wrapper allows a number of 226 // pre-defined runners to be used via the --runner flag. 227 if err := beamx.Run(context.Background(), p); err != nil { 228 log.Fatalf("Failed to execute job: %v", err) 229 } 230 }