github.com/apache/beam/sdks/v2@v2.48.2/go/examples/contains/contains.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one or more 2 // contributor license agreements. See the NOTICE file distributed with 3 // this work for additional information regarding copyright ownership. 4 // The ASF licenses this file to You under the Apache License, Version 2.0 5 // (the "License"); you may not use this file except in compliance with 6 // the License. You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 package main 17 18 // beam-playground: 19 // name: Contains 20 // description: An example counts received substring in Shakespeare's works. 21 // multifile: false 22 // pipeline_options: --search king 23 // context_line: 51 24 // categories: 25 // - Filtering 26 // - Options 27 // - Debugging 28 // complexity: MEDIUM 29 // tags: 30 // - count 31 // - io 32 // - strings 33 34 import ( 35 "context" 36 "flag" 37 "fmt" 38 "regexp" 39 "strings" 40 41 "github.com/apache/beam/sdks/v2/go/pkg/beam" 42 "github.com/apache/beam/sdks/v2/go/pkg/beam/io/textio" 43 "github.com/apache/beam/sdks/v2/go/pkg/beam/log" 44 "github.com/apache/beam/sdks/v2/go/pkg/beam/register" 45 "github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/stats" 46 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx" 47 "github.com/apache/beam/sdks/v2/go/pkg/beam/x/debug" 48 ) 49 50 // Options used purely at pipeline construction-time can just be flags. 51 var ( 52 input = flag.String("input", "gs://apache-beam-samples/shakespeare/kinglear.txt", "File(s) to read.") 53 search = flag.String("search", "", "Only return words that contain this substring.") 54 ) 55 56 func init() { 57 register.Function2x0(extractFn) 58 register.Function2x1(formatFn) 59 register.DoFn2x0[string, func(string)](&includeFn{}) 60 register.Emitter1[string]() 61 } 62 63 // FilterWords returns PCollection<KV<word,count>> with (up to) 10 matching words. 64 func FilterWords(s beam.Scope, lines beam.PCollection) beam.PCollection { 65 s = s.Scope("FilterWords") 66 words := beam.ParDo(s, extractFn, lines) 67 filtered := beam.ParDo(s, &includeFn{Search: *search}, words) 68 counted := stats.Count(s, filtered) 69 return debug.Head(s, counted, 10) 70 } 71 72 var wordRE = regexp.MustCompile(`[a-zA-Z]+('[a-z])?`) 73 74 func extractFn(line string, emit func(string)) { 75 for _, w := range wordRE.FindAllString(line, -1) { 76 emit(w) 77 } 78 } 79 80 // includeFn outputs (word) iif the word contains substring Search. 81 type includeFn struct { 82 Search string `json:"search"` 83 } 84 85 func (f *includeFn) ProcessElement(s string, emit func(string)) { 86 if strings.Contains(s, f.Search) { 87 emit(s) 88 } 89 } 90 91 func formatFn(w string, c int) string { 92 return fmt.Sprintf("%s: %v", w, c) 93 } 94 95 func main() { 96 flag.Parse() 97 beam.Init() 98 99 ctx := context.Background() 100 101 if *search == "" { 102 log.Exit(ctx, "No search string provided. Use --search=foo") 103 } 104 105 log.Info(ctx, "Running contains") 106 107 // Construct a pipeline that only keeps 10 words that contain the provided search string. 108 p := beam.NewPipeline() 109 s := p.Root() 110 lines := textio.Read(s, *input) 111 filtered := FilterWords(s, lines) 112 formatted := beam.ParDo(s, formatFn, filtered) 113 debug.Print(s, formatted) 114 115 if err := beamx.Run(ctx, p); err != nil { 116 log.Exitf(ctx, "Failed to execute job: %v", err) 117 } 118 }