github.com/apache/beam/sdks/v2@v2.48.2/go/test/integration/wordcount/wordcount.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  // Package wordcount contains transforms for wordcount.
    17  package wordcount
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"regexp"
    23  	"strings"
    24  
    25  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    26  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
    27  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    28  	"github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert"
    29  	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"
    30  )
    31  
    32  var (
    33  	wordRE     = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`)
    34  	empty      = beam.NewCounter("extract", "emptyLines")
    35  	lineLen    = beam.NewDistribution("extract", "lineLenDistro")
    36  	smallWords = beam.NewCounter("extract", "smallWords")
    37  )
    38  
    39  func init() {
    40  	register.Function3x0(extractFn)
    41  	register.Function2x1(formatFn)
    42  
    43  	register.Emitter1[string]()
    44  }
    45  
    46  // CountWords is a composite transform that counts the words of a PCollection
    47  // of lines. It expects a PCollection of type string and returns a PCollection
    48  // of type KV<string,int>. The Beam type checker enforces these constraints
    49  // during pipeline construction.
    50  func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection {
    51  	s = s.Scope("CountWords")
    52  
    53  	// Convert lines of text into individual words.
    54  	col := beam.ParDo(s, extractFn, lines)
    55  
    56  	// Count the number of times each word occurs.
    57  	return stats.Count(s, col)
    58  }
    59  
    60  // extractFn is a DoFn that emits the words in a given line.
    61  func extractFn(ctx context.Context, line string, emit func(string)) {
    62  	lineLen.Update(ctx, int64(len(line)))
    63  	if len(strings.TrimSpace(line)) == 0 {
    64  		empty.Inc(ctx, 1)
    65  	}
    66  	for _, word := range wordRE.FindAllString(line, -1) {
    67  		if len(word) < 6 {
    68  			smallWords.Inc(ctx, 1)
    69  		}
    70  		emit(word)
    71  	}
    72  }
    73  
    74  // Format formats a KV of a word and its count as a string.
    75  func Format(s beam.Scope, counted beam.PCollection) beam.PCollection {
    76  	return beam.ParDo(s, formatFn, counted)
    77  }
    78  
    79  func formatFn(w string, c int) string {
    80  	return fmt.Sprintf("%s: %v", w, c)
    81  }
    82  
    83  // WordCount returns a self-validating wordcount pipeline.
    84  func WordCount(glob, hash string, size int) *beam.Pipeline {
    85  	p, s := beam.NewPipelineWithRoot()
    86  
    87  	in := textio.Read(s, glob)
    88  	WordCountFromPCol(s, in, hash, size)
    89  	return p
    90  }
    91  
    92  // WordCountFromPCol counts the words from a PCollection and validates it.
    93  func WordCountFromPCol(s beam.Scope, in beam.PCollection, hash string, size int) {
    94  	out := Format(s, CountWords(s, in))
    95  	passert.Hash(s, out, "out", hash, size)
    96  }