github.com/apache/beam/sdks/v2@v2.48.2/go/examples/large_wordcount/large_wordcount.go

// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements.  See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License.  You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// large_wordcount is an example that demonstrates a more complex version
// of a wordcount pipeline. It uses a SplittableDoFn for reading the
// text files, then uses a map side input to build sorted shards.
//
// This example, large_wordcount, is the fourth in a series of five
// successively more detailed 'word count' examples. You may first want to
// take a look at minimal_wordcount and wordcount.
// Then look at debugging_wordcount for some testing and validation concepts.
// After you've looked at this example, follow up with the windowed_wordcount
// pipeline, which introduces additional concepts.
//
// Basic concepts, also in the minimal_wordcount and wordcount examples:
// reading text files; counting a PCollection; executing a Pipeline both locally
// and using a selected runner; defining DoFns.
//
// New Concepts:
//
//  1. Using a SplittableDoFn-based transform (textio.ReadSdf) to read the input files.
//  2. Using a Map Side Input to access values for specific keys.
//  3. Testing your Pipeline via passert and metrics, using Go testing tools.
//
// Beyond this list, the example does not enumerate concepts; it documents
// them where they appear. There may be repetition from previous examples.
//
// To change the runner, specify:
//
//	--runner=YOUR_SELECTED_RUNNER
//
// The input file defaults to a public data set containing the text of King
// Lear, by William Shakespeare. You can override it and choose your own input
// with --input.
package main

import (
	"context"
	"flag"
	"fmt"
	"regexp"
	"sort"
	"strconv"
	"time"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/rtrackers/offsetrange"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats"

	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"

	// The imports here are for the side effect of filesystem registration.
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/gcs"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/io/filesystem/local"

	// The imports here are for the side effect of runner registration.
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/dataflow"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/direct"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/dot"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/flink"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/samza"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/spark"
	_ "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/universal"
)

var (
	input  = flag.String("input", "gs://apache-beam-samples/shakespeare/*.txt", "File(s) to read.")
	output = flag.String("output", "", "Output file (required). Use @* for a dynamic number of shards, or @N (e.g. @5) for a fixed number of shards. No shard indicator means a single output file.")
)

// Concept: DoFn and Type Registration
// All DoFns and user types used as PCollection elements must be registered with beam.

func init() {
	register.Function2x0(extractFn)
	register.Function2x1(formatFn)
	register.DoFn4x1[context.Context, []byte, func(*string) bool, func(metakey), error](&makeMetakeys{})

	register.DoFn4x0[context.Context, string, func(*metakey) bool, func(metakey, string)](&pairWithMetakey{})
	register.DoFn5x1[context.Context, metakey, func(*string) bool, func(string) func(*int) bool, func(string), error](&writeTempFiles{})
	register.DoFn4x1[context.Context, metakey, func(*string) bool, func(string), error](&renameFiles{})

	register.Emitter1[metakey]()
	register.Emitter2[metakey, string]()
	register.Iter1[*string]()
	register.Iter1[*metakey]()
}
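
// Note on naming: register.FunctionAxB and register.DoFnAxB register a
// function or a ProcessElement method with A parameters and B return values.
// The Emitter and Iter registrations cover the emit and iterator function
// types appearing in those signatures, letting the runner invoke them
// without reflection.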

// The transforms below are identical to the wordcount versions. If this were
// production code, common transforms would be placed in a separate package
// and shared directly rather than being copied.

var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`)

// extractFn is a DoFn that emits the words in a given line.
func extractFn(line string, emit func(string)) {
	for _, word := range wordRE.FindAllString(line, -1) {
		emit(word)
	}
}

// formatFn is a DoFn that formats a word and its count as a string.
func formatFn(w string, c int) string {
	return fmt.Sprintf("%s: %v", w, c)
}

// CountWords is a composite transform that counts the words of a PCollection
// of lines. It expects a PCollection of type string and returns a PCollection
// of type KV<string,int>.
func CountWords(s beam.Scope, lines beam.PCollection) beam.PCollection {
	s = s.Scope("CountWords")
	col := beam.ParDo(s, extractFn, lines)
	return stats.Count(s, col)
}
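
// stats.Count is itself a composite transform. A minimal hand-rolled
// equivalent, sketched here for illustration (countWordsManual is not used
// by this pipeline), pairs each word with 1 and sums per key.

// pairWithOneFn pairs a word with the count 1.
func pairWithOneFn(w string) (string, int) { return w, 1 }

func init() { register.Function1x2(pairWithOneFn) }

// countWordsManual behaves like CountWords without using stats.Count.
func countWordsManual(s beam.Scope, lines beam.PCollection) beam.PCollection {
	s = s.Scope("CountWordsManual")
	words := beam.ParDo(s, extractFn, lines)
	pairs := beam.ParDo(s, pairWithOneFn, words)
	return stats.SumPerKey(s, pairs)
}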

// SortAndShard is defined before the DoFns that build it up, so it can
// provide an overview of this complex segment of the pipeline.

// SortAndShard is a composite transform that takes in a PCollection<string,int>
// and an output pattern. It returns a PCollection<string> with the output file paths.
// It demonstrates using an iterable side input, a map side input, and producing output.
func SortAndShard(s beam.Scope, in beam.PCollection, output string) beam.PCollection {
	s = s.Scope("SortAndShard")
	// For the sake of example, we drop the values, keeping only the keys.
	keys := beam.DropValue(s, in)

	// Concept: Impulse and Side Input to process on a single worker.
	// makeMetakeys is started with an Impulse, and is blocked from starting
	// until its side input is ready. Since Impulse produces exactly one
	// element, all the work for this DoFn executes in a single bundle, on a
	// single worker. This requires that the values fit into the memory of a
	// single worker.

	// makeMetakeys divides the data into several shards as determined by the output pattern.
	// One metakey is produced per shard.
	metakeys := beam.ParDo(s, &makeMetakeys{Output: output}, beam.Impulse(s), beam.SideInput{Input: keys})

	// Takes the metakeys, and pairs each key with its metakey.
	rekeys := beam.ParDo(s, &pairWithMetakey{}, keys, beam.SideInput{Input: metakeys})

	// Group all the newly paired values with their metakeys.
	// This forms the individual shards we will write to files.
	gbmeta := beam.GroupByKey(s, rekeys)

	// writeTempFiles produces temporary output files with the metakey.
	// Counts for each word are looked up in the map side input of the
	// original word + count pairs.
	tmpFiles := beam.ParDo(s, &writeTempFiles{Output: output}, gbmeta, beam.SideInput{Input: in})

	// renameFiles takes the tmp files, and renames them to the final destination.
	// Using temporary names and then renaming is recommended to avoid conflicts on
	// retries, if an earlier attempt failed partway through writing.
	return beam.ParDo(s, &renameFiles{Output: output}, metakeys, beam.SideInput{Input: tmpFiles})
}

// metakey serves the purpose of being a key for splitting up input
// into distinct shards.
type metakey struct {
	Low, High    string
	Shard, Total int
	TmpInfix     int64
}

// outputRE is a regular expression matching the shard indicator: @* or @<shard count>.
var outputRE = regexp.MustCompile(`(@\*|@\d+)`)

// makeTmpInfix converts a Unix time into a compact base-36 string representation.
func makeTmpInfix(v int64) string {
	return strconv.FormatInt(v, 36)
}

// TmpFileName produces a temporary filename for this metakey, including an infix to
// group temporary files from the same run together.
func (m *metakey) TmpFileName(output string) string {
	shard := fmt.Sprintf("%03d-%03d.%s", m.Shard, m.Total, makeTmpInfix(m.TmpInfix))
	return outputRE.ReplaceAllString(output, shard)
}

// FinalFileName produces the final file name for this shard.
func (m *metakey) FinalFileName(output string) string {
	shard := fmt.Sprintf("%03d-%03d", m.Shard, m.Total)
	return outputRE.ReplaceAllString(output, shard)
}
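
// exampleFileNames sketches how a metakey expands an output pattern that
// contains a shard indicator. It is for illustration only and is not used
// by the pipeline.
func exampleFileNames() {
	m := metakey{Low: "a", High: "z", Shard: 2, Total: 5, TmpInfix: 1700000000}
	fmt.Println(m.TmpFileName("out-@*.txt"))   // out-002-005.s44we8.txt
	fmt.Println(m.FinalFileName("out-@*.txt")) // out-002-005.txt
}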

// makeMetakeys produces metakeys for each shard.
type makeMetakeys struct {
	Output  string // The format of output files.
	Dynamic int    // The number of elements for each dynamic shard. Default 10k. Ignored if the format doesn't contain `@*`.

	keycount, metakeycount beam.Counter
}

func (fn *makeMetakeys) StartBundle(_ func(*string) bool, _ func(metakey)) {
	if fn.Dynamic <= 0 {
		fn.Dynamic = 10000
	}
	fn.keycount = beam.NewCounter("wordcount", "keycount")
	fn.metakeycount = beam.NewCounter("metakeys", "metakeycount")
}
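
// Note: the metric handles are created once per bundle in StartBundle; the
// Inc calls in ProcessElement are then cheap to make per element.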

func (fn *makeMetakeys) ProcessElement(ctx context.Context, _ []byte, iter func(*string) bool, emit func(metakey)) error {
	// Pull in and sort all the keys in memory.
	var v string
	var keys []string
	for iter(&v) {
		keys = append(keys, v)
	}
	sort.StringSlice(keys).Sort()

	// Increment for all the keys at once.
	fn.keycount.Inc(ctx, int64(len(keys)))

	// Code within DoFns can be arbitrarily complex,
	// and executes as ordinary code would.

	// First, parse fn.Output for a shard indicator.
	match := outputRE.FindString(fn.Output)
	r := offsetrange.Restriction{Start: 0, End: int64(len(keys)) - 1}
	var rs []offsetrange.Restriction
	switch match {
	case "": // No shard indicator.
		// Everything goes into a single file.
		rs = append(rs, r)
	case "@*": // Dynamic sharding.
		// Basic dynamic sharding, where each file will contain a fixed number of words.
		rs = r.SizedSplits(int64(fn.Dynamic))
	default: // @N fixed sharding.
		// Fixed number of shards, where each shard will contain 1/Nth of the words.
		n, err := strconv.Atoi(match[1:])
		if err != nil {
			return fmt.Errorf("bad output format: unable to extract shard count from %v: %v", fn.Output, err)
		}
		rs = r.EvenSplits(int64(n))
	}
	// Increment the number of expected shards.
	fn.metakeycount.Inc(ctx, int64(len(rs)))

	// Use the current Unix time as the temp infix.
	// Since it's included with all metakeys, an int64 is preferable to a string for compactness.
	tmpInfix := time.Now().Unix()

	// Log the identifier to assist with debugging.
	log.Infof(ctx, "makeMetakeys: temp file identifier %s used for output path %s", makeTmpInfix(tmpInfix), fn.Output)
	for s, ri := range rs {
		emit(metakey{
			Low:      keys[int(ri.Start)],
			High:     keys[int(ri.End)],
			Shard:    s,
			Total:    len(rs),
			TmpInfix: tmpInfix,
		})
	}
	return nil
}
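
// exampleSplits sketches how the offsetrange splits above divide a range of
// sorted key indices into shards. It is for illustration only and is not
// used by the pipeline.
func exampleSplits() {
	r := offsetrange.Restriction{Start: 0, End: 9}
	// SizedSplits produces chunks of at most the given size,
	// e.g. {0 4} {4 8} {8 9} here.
	fmt.Println(r.SizedSplits(4))
	// EvenSplits produces the requested number of roughly equal chunks.
	fmt.Println(r.EvenSplits(2))
}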

// pairWithMetakey processes each element, and re-emits it with its metakey.
// This associates the element with a shard of the final output.
type pairWithMetakey struct {
	mks []metakey
}

func (fn *pairWithMetakey) ProcessElement(ctx context.Context, v string, iter func(*metakey) bool, emit func(metakey, string)) {
	// Read in all the metakeys and sort them by shard.
	// Since this pipeline runs with the global window, the side input
	// will not change, so it can be cached in the DoFn.
	// This read will only happen once per bundle.
	if fn.mks == nil {
		var mk metakey
		for iter(&mk) {
			fn.mks = append(fn.mks, mk)
		}
		sort.Slice(fn.mks, func(i, j int) bool {
			return fn.mks[i].Shard < fn.mks[j].Shard
		})
	}

	// Binary search for the first shard whose High bound is >= the word.
	n := len(fn.mks)
	i := sort.Search(n, func(i int) bool {
		return v <= fn.mks[i].High
	})

	emit(fn.mks[i], v)
}
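
// Note: the binary search above relies on the metakeys being ordered by High
// as well as by Shard. makeMetakeys assigns shard numbers over the keys in
// sorted order, so sorting by Shard also yields ascending High bounds.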

func (fn *pairWithMetakey) FinishBundle(_ func(*metakey) bool, _ func(metakey, string)) {
	fn.mks = nil // allow the metakeys to be garbage collected when the bundle is finished.
}

// writeTempFiles takes each metakey and its grouped words (the original keys), and uses
// a map side input to look up the original sum for each word.
//
// All words for the metakey are sorted in memory and written to a temporary file, and
// the temporary file name is output. Each metakey includes a temporary infix used to
// distinguish one attempt's set of files from another's, and from the final
// successful files.
//
// A more robust implementation would write to the pipeline's temporary folder instead,
// but for this example, using the same output destination is sufficient.
type writeTempFiles struct {
	Output string

	fs          filesystem.Interface
	countdistro beam.Distribution
}

func (fn *writeTempFiles) StartBundle(ctx context.Context, _ func(string) func(*int) bool, _ func(string)) error {
	fs, err := filesystem.New(ctx, fn.Output)
	if err != nil {
		return err
	}
	fn.fs = fs
	fn.countdistro = beam.NewDistribution("wordcount", "countdistro")
	return nil
}

func (fn *writeTempFiles) ProcessElement(ctx context.Context, k metakey, iter func(*string) bool, lookup func(string) func(*int) bool, emitFileName func(string)) error {
	// Pull in and sort all the keys for this shard.
	var v string
	var words []string
	for iter(&v) {
		words = append(words, v)
	}
	sort.StringSlice(words).Sort()

	tmpFile := k.TmpFileName(fn.Output)
	wc, err := fn.fs.OpenWrite(ctx, tmpFile)
	if err != nil {
		return err
	}
	defer wc.Close()
	for _, word := range words {
		var count int
		// Get the count for the word from the map side input.
		lookup(word)(&count)
		// Write the word and count to the file.
		fmt.Fprintf(wc, "%v: %d\n", word, count)
		// Record the count in a distribution of word counts.
		fn.countdistro.Update(ctx, int64(count))
	}
	emitFileName(tmpFile)
	return nil
}
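
// Note: accessing the PCollection<string,int> side input through a
// func(string) func(*int) bool parameter materializes it as a multimap keyed
// by word, so counts can be looked up per key instead of iterating the whole
// collection. Every word grouped under a metakey came from that same
// PCollection, so the first iterator call always finds a value; for an
// absent key the iterator would return false and count would remain 0.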

func (fn *writeTempFiles) FinishBundle(_ func(string) func(*int) bool, _ func(string)) {
	fn.fs.Close()
	fn.fs = nil
}

// renameFiles takes in files to rename as a side input so they can be moved/copied
// only after all file writes have succeeded. Temporary files are removed as part of
// the rename.
//
// This implementation assumes temporary and final locations for files are on the
// same file system.
//
// A more robust implementation would move files from the pipeline's temporary folder
// to the final output, or be able to move the files between different file systems.
type renameFiles struct {
	Output string

	fs filesystem.Interface
}

func (fn *renameFiles) StartBundle(ctx context.Context, _ func(*string) bool, _ func(string)) error {
	fs, err := filesystem.New(ctx, fn.Output)
	if err != nil {
		return err
	}
	fn.fs = fs
	return nil
}

func (fn *renameFiles) ProcessElement(ctx context.Context, k metakey, _ func(*string) bool, emit func(string)) error {
	// We never read the side input of temporary file names, but depending on it
	// is critical: it ensures the rename step occurs only after all temporary
	// files have been written.
	tmp := k.TmpFileName(fn.Output)
	final := k.FinalFileName(fn.Output)
	log.Infof(ctx, "renaming %v to %v", tmp, final)

	// Use the filesystem abstraction to perform the rename.
	if err := filesystem.Rename(ctx, fn.fs, tmp, final); err != nil {
		return err
	}

	// The rename is complete, so we emit the final file name, in case a
	// downstream consumer wishes to block on the file's readiness.
	emit(final)
	return nil
}

func (fn *renameFiles) FinishBundle(ctx context.Context, _ func(*string) bool, _ func(string)) error {
	fn.fs.Close()
	fn.fs = nil
	return nil
}

// Pipeline builds the wordcount pipeline, returning a PCollection of strings
// representing the output files.
func Pipeline(s beam.Scope, input, output string) beam.PCollection {
	// Since this is the whole pipeline, we don't use a subscope here.
	lines := textio.ReadSdf(s, input)
	counted := CountWords(s, lines)
	return SortAndShard(s, counted, output)
}
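
// A local run writing three fixed output shards might look like this
// (a hypothetical invocation; the input path is illustrative):
//
//	go run . --input=./kinglear.txt --output=/tmp/counts-@3.txt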

func main() {
	flag.Parse()
	beam.Init()

	ctx := context.Background()
	if *output == "" {
		log.Exit(ctx, "No output provided")
	}

	p := beam.NewPipeline()
	s := p.Root()
	Pipeline(s, *input, *output)

	if _, err := beamx.RunWithMetrics(ctx, p); err != nil {
		log.Exitf(ctx, "Failed to execute job: %v", err)
	}
}