github.com/apache/beam/sdks/v2@v2.48.2/go/test/integration/wordcount/wordcount.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // Package wordcount contains transforms for wordcount. 17 package wordcount 18 19 import ( 20 "context" 21 "fmt" 22 "regexp" 23 "strings" 24 25 "github.com/apache/beam/sdks/v2/go/pkg/beam" 26 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" 27 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 28 "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert" 29 "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats" 30 ) 31 32 var ( 33 wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`) 34 empty = beam.NewCounter("extract", "emptyLines") 35 lineLen = beam.NewDistribution("extract", "lineLenDistro") 36 smallWords = beam.NewCounter("extract", "smallWords") 37 ) 38 39 func init() { 40 register.Function3x0(extractFn) 41 register.Function2x1(formatFn) 42 43 register.Emitter1[string]() 44 } 45 46 // CountWords is a composite transform that counts the words of a PCollection 47 // of lines. It expects a PCollection of type string and returns a PCollection 48 // of type KV<string,int>. The Beam type checker enforces these constraints 49 // during pipeline construction. 50 func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection { 51 s = s.Scope("CountWords") 52 53 // Convert lines of text into individual words. 54 col := beam.ParDo(s, extractFn, lines) 55 56 // Count the number of times each word occurs. 57 return stats.Count(s, col) 58 } 59 60 // extractFn is a DoFn that emits the words in a given line. 61 func extractFn(ctx context.Context, line string, emit func(string)) { 62 lineLen.Update(ctx, int64(len(line))) 63 if len(strings.TrimSpace(line)) == 0 { 64 empty.Inc(ctx, 1) 65 } 66 for _, word := range wordRE.FindAllString(line, -1) { 67 if len(word) < 6 { 68 smallWords.Inc(ctx, 1) 69 } 70 emit(word) 71 } 72 } 73 74 // Format formats a KV of a word and its count as a string. 75 func Format(s beam.Scope, counted beam.PCollection) beam.PCollection { 76 return beam.ParDo(s, formatFn, counted) 77 } 78 79 func formatFn(w string, c int) string { 80 return fmt.Sprintf("%s: %v", w, c) 81 } 82 83 // WordCount returns a self-validating wordcount pipeline. 84 func WordCount(glob, hash string, size int) *beam.Pipeline { 85 p, s := beam.NewPipelineWithRoot() 86 87 in := textio.Read(s, glob) 88 WordCountFromPCol(s, in, hash, size) 89 return p 90 } 91 92 // WordCountFromPCol counts the words from a PCollection and validates it. 93 func WordCountFromPCol(s beam.Scope, in beam.PCollection, hash string, size int) { 94 out := Format(s, CountWords(s, in)) 95 passert.Hash(s, out, "out", hash, size) 96 }