github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_with_metrics.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow."""

# pytype: skip-file

# beam-playground:
#   name: WordCountWithMetrics
#   description: A word-counting workflow with metrics.
#   multifile: false
#   default_example: true
#   pipeline_options: --output output.txt
#   context_line: 48
#   categories:
#     - Combiners
#     - Options
#     - Metrics
#     - Quickstart
#   complexity: MEDIUM
#   tags:
#     - count
#     - metrics
#     - strings

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""
  def __init__(self):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
    self.words_counter = Metrics.counter(self.__class__, 'words')
    self.word_lengths_counter = Metrics.counter(self.__class__, 'word_lengths')
    self.word_lengths_dist = Metrics.distribution(
        self.__class__, 'word_len_dist')
    self.empty_line_counter = Metrics.counter(self.__class__, 'empty_lines')

  def process(self, element):
    """Returns an iterator over the words of this element.

    The element is a line of text. If the line is blank, note that, too.

    Args:
      element: the element being processed

    Returns:
      The words of this line of text.
    """
    text_line = element.strip()
    if not text_line:
      self.empty_line_counter.inc(1)
    words = re.findall(r'[\w\']+', text_line, re.UNICODE)
    for w in words:
      self.words_counter.inc()
      self.word_lengths_counter.inc(len(w))
      self.word_lengths_dist.update(len(w))
    return words
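

# A minimal sketch (not part of the original file) of exercising the DoFn in
# isolation. TestPipeline, assert_that, and equal_to are Beam's own testing
# utilities; the input strings are illustrative:
#
#   from apache_beam.testing.test_pipeline import TestPipeline
#   from apache_beam.testing.util import assert_that, equal_to
#
#   with TestPipeline() as p:
#     words = (
#         p
#         | beam.Create(['Hello world', ''])
#         | beam.ParDo(WordExtractingDoFn()))
#     assert_that(words, equal_to(['Hello', 'world']))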


def main(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (
      lines
      | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template that doesn't run.
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  main()
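

# Example invocation (a sketch; the output path is a placeholder, and without
# a --runner pipeline option Beam defaults to the local DirectRunner):
#
#   python -m apache_beam.examples.wordcount_with_metrics --output /tmp/counts
#
# After the pipeline finishes, the metric queries above log the number of
# empty lines and the average word length.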