github.com/apache/beam/sdks/v2@v2.48.2/go/examples/windowed_wordcount/windowed_wordcount.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // windowed_wordcount counts words in text, and can run over either unbounded 17 // or bounded input collections. 18 // 19 // This example is the last in a series of four successively more 20 // detailed 'word count' examples. First take a look at minimal_wordcount, 21 // wordcount, and debugging_wordcount. 22 // 23 // Basic concepts, also in the preceeding examples: Reading text files; 24 // counting a PCollection; writing to GCS; executing a Pipeline both locally 25 // and using a selected runner; defining DoFns; user-defined PTransforms; 26 // defining pipeline options. 27 // 28 // New Concepts: 29 // 30 // 1. Unbounded and bounded pipeline input modes 31 // 2. Adding timestamps to data 32 // 3. Windowing 33 // 4. Re-using PTransforms over windowed PCollections 34 // 5. Accessing the window of an element 35 package main 36 37 // beam-playground: 38 // name: WindowedWordCount 39 // description: An example that counts words in text, and can run over either unbounded or bounded input collections. 40 // multifile: false 41 // pipeline_options: --output output.txt 42 // context_line: 75 43 // categories: 44 // - Windowing 45 // - Options 46 // - Combiners 47 // - Quickstart 48 // complexity: ADVANCED 49 // tags: 50 // - count 51 // - stream 52 // - windowing 53 // - io 54 // - strings 55 56 import ( 57 "context" 58 "flag" 59 "fmt" 60 "math/rand" 61 "time" 62 63 "github.com/apache/beam/sdks/v2/go/pkg/beam" 64 "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" 65 "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" 66 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" 67 "github.com/apache/beam/sdks/v2/go/pkg/beam/log" 68 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 69 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 70 "github.com/apache/beam/sdks/v2/go/test/integration/wordcount" 71 ) 72 73 var ( 74 // By default, this example reads from a public dataset containing the text of 75 // King Lear. Set this option to choose a different input file or glob. 76 input = flag.String("input", "gs://apache-beam-samples/shakespeare/kinglear.txt", "File(s) to read.") 77 78 // Set this required option to specify where to write the output file. 79 output = flag.String("output", "", "Output (required).") 80 ) 81 82 func init() { 83 // register.DoFnXxY registers a struct DoFn so that it can be correctly serialized and does some optimization 84 // to avoid runtime reflection. Since addTimestampFn has 1 inputs and 2 outputs, we use register.DoFn1x2 and provide 85 // its input/output types as its constraints. 86 // Struct DoFns must be registered for a pipeline to run. 87 register.DoFn1x2[beam.X, beam.EventTime, beam.X](&addTimestampFn{}) 88 // For simple functional (non-struct) DoFns we can use register.FunctionXxY to perform the same registration without 89 // providing type constraints. 90 register.Function4x1(formatFn) 91 } 92 93 // Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for 94 // this example, for the bounded data case. 95 // 96 // Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate 97 // his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a 98 // 2-hour period. 99 100 type addTimestampFn struct { 101 Min beam.EventTime `json:"min"` 102 } 103 104 func (f *addTimestampFn) ProcessElement(x beam.X) (beam.EventTime, beam.X) { 105 timestamp := f.Min.Add(time.Duration(rand.Int63n(2 * time.Hour.Nanoseconds()))) 106 return timestamp, x 107 } 108 109 // Concept #5: formatFn accesses the window of each element. 110 111 // formatFn is a DoFn that formats a windowed word and its count as a string. 112 func formatFn(iw beam.Window, et beam.EventTime, w string, c int) string { 113 s := fmt.Sprintf("%v@%v %s: %v", et, iw, w, c) 114 return s 115 } 116 117 func main() { 118 flag.Parse() 119 beam.Init() 120 121 ctx := context.Background() 122 123 if *output == "" { 124 log.Exit(ctx, "No --output provided") 125 } 126 127 p := beam.NewPipeline() 128 s := p.Root() 129 130 // Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or 131 // unbounded input source. 132 lines := textio.Read(s, *input) 133 134 // Concept #2: Add an element timestamp, using an artificial time just to show windowing. 135 timestampedLines := beam.ParDo(s, &addTimestampFn{Min: mtime.Now()}, lines) 136 137 // Concept #3: WindowingStrategy into fixed windows. The fixed window size for this example is 1 138 // minute. See the documentation for more information on how fixed windows work, and 139 // for information on the other types of windowing available (e.g., sliding windows). 140 windowedLines := beam.WindowInto(s, window.NewFixedWindows(time.Minute), timestampedLines) 141 142 // Concept #4: Re-use our existing CountWords transform that does not have knowledge of 143 // windows over a PCollection containing windowed values. 144 counted := wordcount.CountWords(s, windowedLines) 145 146 // TODO(herohde) 4/16/2018: textio.Write does not support windowed writes, so we 147 // simply include the window in the output and re-window back into the global window 148 // before the write. 149 150 formatted := beam.ParDo(s, formatFn, counted) 151 merged := beam.WindowInto(s, window.NewGlobalWindows(), formatted) 152 textio.Write(s, *output, merged) 153 154 if err := beamx.Run(context.Background(), p); err != nil { 155 log.Exitf(ctx, "Failed to execute job: %v", err) 156 } 157 }