github.com/apache/beam/sdks/v2@v2.48.2/go/examples/windowed_wordcount/windowed_wordcount.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one or more
     2  // contributor license agreements.  See the NOTICE file distributed with
     3  // this work for additional information regarding copyright ownership.
     4  // The ASF licenses this file to You under the Apache License, Version 2.0
     5  // (the "License"); you may not use this file except in compliance with
     6  // the License.  You may obtain a copy of the License at
     7  //
     8  //    http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  
    16  // windowed_wordcount counts words in text, and can run over either unbounded
    17  // or bounded input collections.
    18  //
    19  // This example is the last in a series of four successively more
    20  // detailed 'word count' examples. First take a look at minimal_wordcount,
    21  // wordcount, and debugging_wordcount.
    22  //
    23  // Basic concepts, also in the preceding examples: Reading text files;
    24  // counting a PCollection; writing to GCS; executing a Pipeline both locally
    25  // and using a selected runner; defining DoFns; user-defined PTransforms;
    26  // defining pipeline options.
    27  //
    28  // New Concepts:
    29  //
    30  //  1. Unbounded and bounded pipeline input modes
    31  //  2. Adding timestamps to data
    32  //  3. Windowing
    33  //  4. Re-using PTransforms over windowed PCollections
    34  //  5. Accessing the window of an element
    35  package main
    36  
    37  // beam-playground:
    38  //   name: WindowedWordCount
    39  //   description: An example that counts words in text, and can run over either unbounded or bounded input collections.
    40  //   multifile: false
    41  //   pipeline_options: --output output.txt
    42  //   context_line: 75
    43  //   categories:
    44  //     - Windowing
    45  //     - Options
    46  //     - Combiners
    47  //     - Quickstart
    48  //   complexity: ADVANCED
    49  //   tags:
    50  //     - count
    51  //     - stream
    52  //     - windowing
    53  //     - io
    54  //     - strings
    55  
    56  import (
    57  	"context"
    58  	"flag"
    59  	"fmt"
    60  	"math/rand"
    61  	"time"
    62  
    63  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    64  	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime"
    65  	"github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window"
    66  	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio"
    67  	"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
    68  	"github.com/apache/beam/sdks/v2/go/pkg/beam/register"
    69  	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
    70  	"github.com/apache/beam/sdks/v2/go/test/integration/wordcount"
    71  )
    72  
var (
	// input is the file or glob to read. By default, this example reads from a
	// public dataset containing the text of King Lear.
	input = flag.String("input", "gs://apache-beam-samples/shakespeare/kinglear.txt", "File(s) to read.")

	// output specifies where to write the output file. It is required: main
	// exits when it is left empty.
	output = flag.String("output", "", "Output (required).")
)
    81  
    82  func init() {
    83  	// register.DoFnXxY registers a struct DoFn so that it can be correctly serialized and does some optimization
    84  	// to avoid runtime reflection. Since addTimestampFn has 1 inputs and 2 outputs, we use register.DoFn1x2 and provide
    85  	// its input/output types as its constraints.
    86  	// Struct DoFns must be registered for a pipeline to run.
    87  	register.DoFn1x2[beam.X, beam.EventTime, beam.X](&addTimestampFn{})
    88  	// For simple functional (non-struct) DoFns we can use register.FunctionXxY to perform the same registration without
    89  	// providing type constraints.
    90  	register.Function4x1(formatFn)
    91  }
    92  
    93  // Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for
    94  // this example, for the bounded data case.
    95  //
    96  // Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate
    97  // his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a
    98  // 2-hour period.
    99  
   100  type addTimestampFn struct {
   101  	Min beam.EventTime `json:"min"`
   102  }
   103  
   104  func (f *addTimestampFn) ProcessElement(x beam.X) (beam.EventTime, beam.X) {
   105  	timestamp := f.Min.Add(time.Duration(rand.Int63n(2 * time.Hour.Nanoseconds())))
   106  	return timestamp, x
   107  }
   108  
   109  // Concept #5: formatFn accesses the window of each element.
   110  
   111  // formatFn is a DoFn that formats a windowed word and its count as a string.
   112  func formatFn(iw beam.Window, et beam.EventTime, w string, c int) string {
   113  	s := fmt.Sprintf("%v@%v %s: %v", et, iw, w, c)
   114  	return s
   115  }
   116  
   117  func main() {
   118  	flag.Parse()
   119  	beam.Init()
   120  
   121  	ctx := context.Background()
   122  
   123  	if *output == "" {
   124  		log.Exit(ctx, "No --output provided")
   125  	}
   126  
   127  	p := beam.NewPipeline()
   128  	s := p.Root()
   129  
   130  	// Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
   131  	// unbounded input source.
   132  	lines := textio.Read(s, *input)
   133  
   134  	// Concept #2: Add an element timestamp, using an artificial time just to show windowing.
   135  	timestampedLines := beam.ParDo(s, &addTimestampFn{Min: mtime.Now()}, lines)
   136  
   137  	// Concept #3: WindowingStrategy into fixed windows. The fixed window size for this example is 1
   138  	// minute. See the documentation for more information on how fixed windows work, and
   139  	// for information on the other types of windowing available (e.g., sliding windows).
   140  	windowedLines := beam.WindowInto(s, window.NewFixedWindows(time.Minute), timestampedLines)
   141  
   142  	// Concept #4: Re-use our existing CountWords transform that does not have knowledge of
   143  	// windows over a PCollection containing windowed values.
   144  	counted := wordcount.CountWords(s, windowedLines)
   145  
   146  	// TODO(herohde) 4/16/2018: textio.Write does not support windowed writes, so we
   147  	// simply include the window in the output and re-window back into the global window
   148  	// before the write.
   149  
   150  	formatted := beam.ParDo(s, formatFn, counted)
   151  	merged := beam.WindowInto(s, window.NewGlobalWindows(), formatted)
   152  	textio.Write(s, *output, merged)
   153  
   154  	if err := beamx.Run(context.Background(), p); err != nil {
   155  		log.Exitf(ctx, "Failed to execute job: %v", err)
   156  	}
   157  }