github.com/apache/beam/sdks/v2@v2.48.2/go/examples/large_wordcount/large_wordcount.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// large_wordcount is an example that demonstrates a more complex version
// of a wordcount pipeline. It uses a SplittableDoFn for reading the
// text files, then uses a map side input to build sorted shards.
//
// This example, large_wordcount, is the fourth in a series of five
// successively more detailed 'word count' examples. You may first want to
// take a look at minimal_wordcount and wordcount.
// Then look at debugging_wordcount for some testing and validation concepts.
// After you've looked at this example, follow up with the windowed_wordcount
// pipeline, for an introduction to additional concepts.
//
// Basic concepts, also in the minimal_wordcount and wordcount examples:
// Reading text files; counting a PCollection; executing a Pipeline both locally
// and using a selected runner; defining DoFns.
//
// New Concepts:
//
//  1. Using a SplittableDoFn transform to read the input files.
//  2. Using a Map Side Input to access values for specific keys.
//  3. Testing your Pipeline via passert and metrics, using Go testing tools.
//
// This example will not be enumerating concepts, but will document them as they
// appear. There may be repetition from previous examples.
//
// To change the runner, specify:
//
//	--runner=YOUR_SELECTED_RUNNER
//
// The input file defaults to a public data set containing the text of King
// Lear, by William Shakespeare. You can override it and choose your own input
// with --input.
package main

import (
	"context"
	"flag"
	"fmt"
	"regexp"
	"sort"
	"strconv"
	"time"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/rtrackers/offsetrange"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"

	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"

	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/gcs"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/local"

	// The imports here are for the side effect of runner registration.
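	// For example, passing --runner=flink on the command line selects the
	// Flink runner registered by the flink import below; only runners that
	// are imported (and therefore registered) here can be selected at runtime.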
72 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/dataflow" 73 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct" 74 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/dot" 75 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/flink" 76 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/samza" 77 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/spark" 78 _ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/universal" 79 ) 80 81 var ( 82 input = flag.String("input", "gs://apache-beam-samples/shakespeare/*.txt", "File(s) to read.") 83 output = flag.String("output", "", "Output file (required). Use @* or @N (eg. @5) to indicate dynamic, or fixed number of shards. No shard indicator means a single file.") 84 ) 85 86 // Concept: DoFn and Type Registration 87 // All DoFns and user types used as PCollection elements must be registered with beam. 88 89 func init() { 90 register.Function2x0(extractFn) 91 register.Function2x1(formatFn) 92 register.DoFn4x1[context.Context, []byte, func(*string) bool, func(metakey), error](&makeMetakeys{}) 93 94 register.DoFn4x0[context.Context, string, func(*metakey) bool, func(metakey, string)](&pairWithMetakey{}) 95 register.DoFn5x1[context.Context, metakey, func(*string) bool, func(string) func(*int) bool, func(string), error](&writeTempFiles{}) 96 register.DoFn4x1[context.Context, metakey, func(*string) bool, func(string), error](&renameFiles{}) 97 98 register.Emitter1[metakey]() 99 register.Emitter2[metakey, string]() 100 register.Iter1[*string]() 101 register.Iter1[*metakey]() 102 } 103 104 // The below transforms are identical to the wordcount versions. If this was 105 // production code, common transforms would be placed in a separate package 106 // and shared directly rather than being copied. 107 108 var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`) 109 110 // extractFn is a DoFn that emits the words in a given line. 111 func extractFn(line string, emit func(string)) { 112 for _, word := range wordRE.FindAllString(line, -1) { 113 emit(word) 114 } 115 } 116 117 // formatFn is a DoFn that formats a word and its count as a string. 118 func formatFn(w string, c int) string { 119 return fmt.Sprintf("%s: %v", w, c) 120 } 121 122 // CountWords is a composite transform that counts the words of an PCollection 123 // of lines. It expects a PCollection of type string and returns a PCollection 124 // of type KV<string,int>. 125 func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection { 126 s = s.Scope("CountWords") 127 col := beam.ParDo(s, extractFn, lines) 128 return stats.Count(s, col) 129 } 130 131 // SortAndShard is defined earlier in the file, so it can provide an overview of this 132 // complex segment of pipeline. The DoFns that build it up follow. 133 134 // SortAndShard is a composite transform takes in a PCollection<string,int> 135 // and an output pattern. It returns a PCollection<string> with the output file paths. 136 // It demonstrates using a side input, a map side input and producing output. 137 func SortAndShard(s beam.Scope, in beam.PCollection, output string) beam.PCollection { 138 s = s.Scope("SortAndShard") 139 // For the sake of example, we drop the values from the keys. 140 keys := beam.DropValue(s, in) 141 142 // Concept: Impulse and Side Input to process on a single worker. 143 // makeMetakeys is being started with an Impulse, and blocked from starting 144 // until it's side input is ready. This will have all the work done for this 145 // DoFn executed in a single bundle, on a single worker. 

	// makeMetakeys divides the data into several shards as determined by the output pattern.
	// One metakey is produced per shard.
	metakeys := beam.ParDo(s, &makeMetakeys{Output: output}, beam.Impulse(s), beam.SideInput{Input: keys})

	// Takes the metakeys, and pairs each key with its metakey.
	rekeys := beam.ParDo(s, &pairWithMetakey{}, keys, beam.SideInput{Input: metakeys})

	// Group all the newly paired values with their metakeys.
	// This forms the individual shards we will write to files.
	gbmeta := beam.GroupByKey(s, rekeys)

	// writeTempFiles produces temporary output files with the metakey.
	// Counts for each word are looked up in the map side input of the
	// original word + count pairs.
	tmpFiles := beam.ParDo(s, &writeTempFiles{Output: output}, gbmeta, beam.SideInput{Input: in})

	// renameFiles takes the tmp files, and renames them to the final destination.
	// Using temporary names and then renaming is recommended to avoid conflicts on retries,
	// if the original files fail to write.
	return beam.ParDo(s, &renameFiles{Output: output}, metakeys, beam.SideInput{Input: tmpFiles})
}

// metakey is a key used to split the input into distinct shards.
type metakey struct {
	Low, High    string
	Shard, Total int
	TmpInfix     int64
}

// outputRE is a regular expression representing the shard indicator: @* or @<shard count>
var outputRE = regexp.MustCompile(`(@\*|@\d+)`)

// makeTmpInfix converts a unix time into a compact string representation.
func makeTmpInfix(v int64) string {
	return strconv.FormatInt(v, 36)
}

// TmpFileName produces a temporary filename for this metakey, including an infix to
// group temporary files from the same run together.
func (m *metakey) TmpFileName(output string) string {
	shard := fmt.Sprintf("%03d-%03d.%s", m.Shard, m.Total, makeTmpInfix(m.TmpInfix))
	return outputRE.ReplaceAllString(output, shard)
}

// FinalFileName produces the final file name for this shard.
func (m *metakey) FinalFileName(output string) string {
	shard := fmt.Sprintf("%03d-%03d", m.Shard, m.Total)
	return outputRE.ReplaceAllString(output, shard)
}

// makeMetakeys produces metakeys for each shard.
type makeMetakeys struct {
	Output  string // The format of output files.
	Dynamic int    // The number of elements for each dynamic shard. Default 10k. Ignored if the format doesn't contain `@*`.

	keycount, metakeycount beam.Counter
}

func (fn *makeMetakeys) StartBundle(_ func(*string) bool, _ func(metakey)) {
	if fn.Dynamic <= 0 {
		fn.Dynamic = 10000
	}
	fn.keycount = beam.NewCounter("wordcount", "keycount")
	fn.metakeycount = beam.NewCounter("metakeys", "metakeycount")
}

func (fn *makeMetakeys) ProcessElement(ctx context.Context, _ []byte, iter func(*string) bool, emit func(metakey)) error {
	// Pull in and sort all the keys in memory.
	var v string
	var keys []string
	for iter(&v) {
		keys = append(keys, v)
	}
	sort.StringSlice(keys).Sort()

	// Increment for all the keys at once.
	fn.keycount.Inc(ctx, int64(len(keys)))

	// Code within DoFns can be arbitrarily complex,
	// and executes as ordinary code would.

	// First, parse fn.Output for a shard indicator.
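	// For example (illustrative values): an output of "counts-@3.txt" asks for
	// three fixed shards, "counts-@*.txt" asks for dynamic sharding, and
	// "counts.txt" (no shard indicator) produces a single file.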
	match := outputRE.FindString(fn.Output)
	r := offsetrange.Restriction{Start: 0, End: int64(len(keys)) - 1}
	var rs []offsetrange.Restriction
	switch match {
	case "": // No matches
		// Everything in a single file.
		rs = append(rs, r)
	case "@*": // Dynamic Sharding
		// Basic dynamic sharding, where each file will contain a fixed number of words.
		rs = r.SizedSplits(int64(fn.Dynamic))
	default: // @N Fixed Sharding
		// Fixed number of shards, where each shard will contain 1/Nth of the words.
		n, err := strconv.Atoi(match[1:])
		if err != nil {
			return fmt.Errorf("bad output format: Unable to extract shard count from %v: %v", fn.Output, err)
		}
		rs = r.EvenSplits(int64(n))
	}
	// Increment the number of expected shards.
	fn.metakeycount.Inc(ctx, int64(len(rs)))

	// Use the current unix time as the temp infix.
	// Since it's included with all metakeys, an int64 is preferable to a string for compactness.
	tmpInfix := time.Now().Unix()

	// Log the identifier to assist with debugging.
	log.Infof(ctx, "makeMetakeys: temp file identifier %s used for output path %s", makeTmpInfix(tmpInfix), fn.Output)
	for s, ri := range rs {
		emit(metakey{
			Low:      keys[int(ri.Start)],
			High:     keys[int(ri.End)],
			Shard:    s,
			Total:    len(rs),
			TmpInfix: tmpInfix,
		})
	}
	return nil
}

// pairWithMetakey processes each element and re-emits it with its metakey.
// This associates each element with a shard of the final output.
type pairWithMetakey struct {
	mks []metakey
}

func (fn *pairWithMetakey) ProcessElement(ctx context.Context, v string, iter func(*metakey) bool, emit func(metakey, string)) {
	// Read in and sort all the metakeys when processing the first element.
	// Since this pipeline runs with the global window, the side input
	// will not change, so it can be cached in the DoFn.
	// This will only happen once per bundle.
	if fn.mks == nil {
		var mk metakey
		for iter(&mk) {
			fn.mks = append(fn.mks, mk)
		}
		sort.Slice(fn.mks, func(i, j int) bool {
			return fn.mks[i].Shard < fn.mks[j].Shard
		})
	}

	n := len(fn.mks)
	i := sort.Search(n, func(i int) bool {
		return v <= fn.mks[i].High
	})

	emit(fn.mks[i], v)
}

func (fn *pairWithMetakey) FinishBundle(_ func(*metakey) bool, _ func(metakey, string)) {
	fn.mks = nil // Allow the metakeys to be garbage collected when the bundle is finished.
}

// writeTempFiles takes each metakey and its grouped words (the original keys), and uses
// a map side input to look up the original sum for each word.
//
// All words for the metakey are sorted in memory and written to a temporary file, outputting
// the temporary file name. Each metakey includes a temporary infix used to distinguish
// a given attempt's set of files from each other, and from the final successful files.
//
// A more robust implementation would write to the pipeline's temporary folder instead,
// but for this example, using the same output destination is sufficient.
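//
// Concept: Map Side Input. Because the side input is a PCollection of
// KV<string,int>, it is requested below as a lookup function of the form
// func(string) func(*int) bool: calling lookup(word) returns an iterator
// over the values associated with that word.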
type writeTempFiles struct {
	Output string

	fs          filesystem.Interface
	countdistro beam.Distribution
}

func (fn *writeTempFiles) StartBundle(ctx context.Context, _ func(string) func(*int) bool, _ func(string)) error {
	fs, err := filesystem.New(ctx, fn.Output)
	if err != nil {
		return err
	}
	fn.fs = fs
	fn.countdistro = beam.NewDistribution("wordcount", "countdistro")
	return nil
}

func (fn *writeTempFiles) ProcessElement(ctx context.Context, k metakey, iter func(*string) bool, lookup func(string) func(*int) bool, emitFileName func(string)) error {
	// Pull in and sort all the keys for this shard.
	var v string
	var words []string
	for iter(&v) {
		words = append(words, v)
	}
	sort.StringSlice(words).Sort()

	tmpFile := k.TmpFileName(fn.Output)
	wc, err := fn.fs.OpenWrite(ctx, tmpFile)
	if err != nil {
		return err
	}
	defer wc.Close()
	for _, word := range words {
		var count int
		// Get the count for the word from the map side input.
		lookup(word)(&count)
		// Write the word and count to the file.
		fmt.Fprintf(wc, "%v: %d\n", word, count)
		// Add the count to a distribution metric for word counts.
		fn.countdistro.Update(ctx, int64(count))
	}
	emitFileName(tmpFile)
	return nil
}

func (fn *writeTempFiles) FinishBundle(_ func(string) func(*int) bool, _ func(string)) {
	fn.fs.Close()
	fn.fs = nil
}

// renameFiles takes in files to rename as a side input so they can be moved/copied
// after successful file writes. Temporary files are removed as part of the rename.
//
// This implementation assumes temporary and final locations for files are on the
// same file system.
//
// A more robust implementation would move from the pipeline's temporary folder to
// the final output, or be able to move the files between different file systems.
type renameFiles struct {
	Output string

	fs filesystem.Interface
}

func (fn *renameFiles) StartBundle(ctx context.Context, _ func(*string) bool, _ func(string)) error {
	fs, err := filesystem.New(ctx, fn.Output)
	if err != nil {
		return err
	}
	fn.fs = fs
	return nil
}

func (fn *renameFiles) ProcessElement(ctx context.Context, k metakey, _ func(*string) bool, emit func(string)) error {
	// We don't read the side input of temporary file names, but it is critical:
	// it ensures the rename step occurs only after all temporary files have been written.
	tmp := k.TmpFileName(fn.Output)
	final := k.FinalFileName(fn.Output)
	log.Infof(ctx, "renaming %v to %v", tmp, final)

	// Use the filesystem abstraction to perform the rename.
	if err := filesystem.Rename(ctx, fn.fs, tmp, final); err != nil {
		return err
	}

	// The rename is complete, so we emit the final file name, in case a downstream
	// consumer wishes to block on the files' readiness.
	emit(final)
	return nil
}

func (fn *renameFiles) FinishBundle(ctx context.Context, _ func(*string) bool, _ func(string)) error {
	fn.fs.Close()
	fn.fs = nil
	return nil
}

// Pipeline builds the wordcount pipeline, returning a PCollection of strings
// representing the output files.
func Pipeline(s beam.Scope, input, output string) beam.PCollection {
	// Since this is the whole pipeline, we don't use a subscope here.
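	// Concept: SplittableDoFn reading. textio.ReadSdf expands the glob in input
	// and reads the matched files with a SplittableDoFn, which lets the runner
	// split the reading of large files and distribute it across workers.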
	lines := textio.ReadSdf(s, input)
	counted := CountWords(s, lines)
	return SortAndShard(s, counted, output)
}

func main() {
	flag.Parse()
	beam.Init()

	ctx := context.Background()
	if *output == "" {
		log.Exit(ctx, "No output provided")
	}

	p := beam.NewPipeline()
	s := p.Root()
	Pipeline(s, *input, *output)

	if _, err := beamx.RunWithMetrics(ctx, p); err != nil {
		log.Exitf(ctx, "Failed to execute job: %v", err)
	}
}
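
// Concept: Testing. The testing concepts mentioned above (passert, metrics, and
// the Go testing tools) live in a companion _test.go file rather than in this
// file. A minimal sketch of what such a test could look like, assuming the
// ptest and passert packages and illustrative input values, is:
//
//	func TestCountWords(t *testing.T) {
//		// Build a test pipeline from an in-memory list of lines.
//		p, s, lines := ptest.CreateList([]string{"a b", "a"})
//		// Count the words and format the results for comparison.
//		formatted := beam.ParDo(s, formatFn, CountWords(s, lines))
//		// Assert on the formatted output, then run the pipeline.
//		passert.Equals(s, formatted, "a: 2", "b: 1")
//		ptest.RunAndValidate(t, p)
//	}
//
// Metrics declared in the DoFns (such as the "keycount" counter) can also be
// queried from the pipeline result returned by the test run.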