github.com/apache/beam/sdks/v2@v2.48.2/go/examples/debugging_wordcount/debugging_wordcount.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // debugging_wordcount is an example that verifies word counts in Shakespeare 17 // and includes Beam best practices. 18 // 19 // This example, debugging_wordcount, is the third in a series of four 20 // successively more detailed 'word count' examples. You may first want to 21 // take a look at minimal_wordcount and wordcount. After you've looked at 22 // this example, then see the windowed_wordcount pipeline, for introduction 23 // of additional concepts. 24 // 25 // Basic concepts, also in the minimal_wordcount and wordcount examples: 26 // Reading text files; counting a PCollection; executing a Pipeline both locally 27 // and using a selected runner; defining DoFns. 28 // 29 // New Concepts: 30 // 31 // 1. Using the richer struct DoFn form and accessing optional arguments. 32 // 2. Logging using the Beam log package, even in a distributed environment 33 // 3. Testing your Pipeline via passert 34 // 35 // To change the runner, specify: 36 // 37 // --runner=YOUR_SELECTED_RUNNER 38 // 39 // The input file defaults to a public data set containing the text of King 40 // Lear, by William Shakespeare. You can override it and choose your own input 41 // with --input. 42 package main 43 44 // beam-playground: 45 // name: DebuggingWordCount 46 // description: An example that counts words in Shakespeare's works includes regex filter("Flourish|stomach"). 47 // multifile: false 48 // pipeline_options: --output output.txt 49 // context_line: 83 50 // categories: 51 // - Options 52 // - Filtering 53 // - Debugging 54 // - Quickstart 55 // complexity: MEDIUM 56 // tags: 57 // - count 58 // - io 59 // - strings 60 61 import ( 62 "context" 63 "flag" 64 "fmt" 65 "regexp" 66 67 "github.com/apache/beam/sdks/v2/go/pkg/beam" 68 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" 69 "github.com/apache/beam/sdks/v2/go/pkg/beam/log" 70 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 71 "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert" 72 "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats" 73 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 74 ) 75 76 // TODO(herohde) 10/16/2017: support metrics and log level cutoff. 77 78 var ( 79 input = flag.String("input", "gs://apache-beam-samples/shakespeare/kinglear.txt", "File(s) to read.") 80 filter = flag.String("filter", "Flourish|stomach", "Regex filter pattern to use. Only words matching this pattern will be included.") 81 output = flag.String("output", "", "Output file (required).") 82 ) 83 84 // Concept #1: a DoFn can also be a struct with methods for setup/teardown and 85 // element/bundle processing. It also allows configuration values to be made 86 // available at runtime. 87 88 func init() { 89 // register.DoFnXxY registers a struct DoFn so that it can be correctly serialized and does some optimization 90 // to avoid runtime reflection. Since addTimestampFn has 4 inputs and 0 outputs, we use register.DoFn4x0 and provide 91 // its input/output types as its constraints. 92 // Struct DoFns must be registered for a pipeline to run. 93 register.DoFn4x0[context.Context, string, int, func(string, int)](&filterFn{}) 94 // For simple functional (non-struct) DoFns we can use register.FunctionXxY to perform the same registration without 95 // providing type constraints. 96 register.Function2x0(extractFn) 97 register.Function2x1(formatFn) 98 // register.EmitterX is optional and will provide some optimization to make things run faster. Any emitters 99 // (functions that produce output for the next step) should be registered. Here we register all emitters with 100 // the signature func(string, int). 101 register.Emitter2[string, int]() 102 } 103 104 // filterFn is a DoFn for filtering out certain words. 105 type filterFn struct { 106 // Filter is a regex that is serialized as json and available at runtime. 107 // Such fields must be exported. 108 Filter string `json:"filter"` 109 110 re *regexp.Regexp 111 } 112 113 func (f *filterFn) Setup() { 114 f.re = regexp.MustCompile(f.Filter) 115 } 116 117 // Concept #2: The Beam log package should used for all logging in runtime 118 // functions. The needed context is made available as an argument. 119 120 func (f *filterFn) ProcessElement(ctx context.Context, word string, count int, emit func(string, int)) { 121 if f.re.MatchString(word) { 122 // Log at the "INFO" level each element that we match. 123 log.Infof(ctx, "Matched: %v", word) 124 emit(word, count) 125 } else { 126 // Log at the "DEBUG" level each element that is not matched. 127 log.Debugf(ctx, "Did not match: %v", word) 128 } 129 } 130 131 // The below transforms are identical to the wordcount versions. If this was 132 // production code, common transforms would be placed in a separate package 133 // and shared directly rather than being copied. 134 135 var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`) 136 137 // extractFn is a DoFn that emits the words in a given line. 138 func extractFn(line string, emit func(string)) { 139 for _, word := range wordRE.FindAllString(line, -1) { 140 emit(word) 141 } 142 } 143 144 // formatFn is a DoFn that formats a word and its count as a string. 145 func formatFn(w string, c int) string { 146 return fmt.Sprintf("%s: %v", w, c) 147 } 148 149 // CountWords is a composite transform that counts the words of an PCollection 150 // of lines. It expects a PCollection of type string and returns a PCollection 151 // of type KV<string,int>. 152 func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection { 153 s = s.Scope("CountWords") 154 col := beam.ParDo(s, extractFn, lines) 155 return stats.Count(s, col) 156 } 157 158 func main() { 159 flag.Parse() 160 beam.Init() 161 162 // Concept #2: the beam logging package works both during pipeline 163 // construction and at runtime. It should always be used. 164 ctx := context.Background() 165 if *output == "" { 166 log.Exit(ctx, "No output provided") 167 } 168 if _, err := regexp.Compile(*filter); err != nil { 169 log.Exitf(ctx, "Invalid filter: %v", err) 170 } 171 172 p := beam.NewPipeline() 173 s := p.Root() 174 175 lines := textio.Read(s, *input) 176 counted := CountWords(s, lines) 177 filtered := beam.ParDo(s, &filterFn{Filter: *filter}, counted) 178 formatted := beam.ParDo(s, formatFn, filtered) 179 180 // Concept #3: passert is a set of convenient PTransforms that can be used 181 // when writing Pipeline level tests to validate the contents of 182 // PCollections. passert is best used in unit tests with small data sets 183 // but is demonstrated here as a teaching tool. 184 185 passert.Equals(s, formatted, "Flourish: 3", "stomach: 1") 186 187 if err := beamx.Run(ctx, p); err != nil { 188 log.Exitf(ctx, "Failed to execute job: %v", err) 189 } 190 }