github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_debugging.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""An example that verifies the counts and includes best practices.

On top of the basic concepts in the wordcount example, this workflow
introduces logging to Cloud Logging and the use of assertions in a Dataflow
pipeline.

To execute this pipeline locally, specify a local output file or an output
prefix on GCS::

  --output [YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]

To execute this pipeline using the Google Cloud Dataflow service, specify
pipeline configuration::

  --project YOUR_PROJECT_ID
  --staging_location gs://YOUR_STAGING_DIRECTORY
  --temp_location gs://YOUR_TEMP_DIRECTORY
  --region GCE_REGION
  --job_name YOUR_JOB_NAME
  --runner DataflowRunner

and an output prefix on GCS::

  --output gs://YOUR_OUTPUT_PREFIX
"""

# pytype: skip-file

# beam-playground:
#   name: WordCountDebugging
#   description: An example that counts words in Shakespeare's works and
#     includes a regex filter ("Flourish|stomach").
#   multifile: false
#   pipeline_options: --output output.txt
#   context_line: 74
#   categories:
#     - Flatten
#     - Debugging
#     - Options
#     - Combiners
#     - Filtering
#     - Quickstart
#   complexity: ADVANCED
#   tags:
#     - count
#     - debug
#     - string

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


class FilterTextFn(beam.DoFn):
  """A DoFn that filters for a specific key based on a regular expression."""
  def __init__(self, pattern):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
    self.pattern = pattern
    # A custom metric can track values in your pipeline as it runs. Those
    # values will be available in the monitoring system of the runner used
    # to run the pipeline. The metrics below track the number of matched
    # and unmatched words.
    self.matched_words = Metrics.counter(self.__class__, 'matched_words')
    self.unmatched_words = Metrics.counter(self.__class__, 'unmatched_words')

  def process(self, element):
    word, _ = element
    if re.match(self.pattern, word):
      # Log at INFO level each element we match. When executing this pipeline
      # using the Dataflow service, these log lines will appear in the Cloud
      # Logging UI.
      logging.info('Matched %s', word)
      self.matched_words.inc()
      yield element
    else:
      # Log at DEBUG level each element that is not matched. Different log
      # levels can be used to control the verbosity of logging, providing an
      # effective mechanism to filter out less important information.
      # Note that currently only logs at INFO level and higher are emitted to
      # Cloud Logging, so this log message will not be visible there.
      logging.debug('Did not match %s', word)
      self.unmatched_words.inc()
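
# An illustrative sketch (not part of the original example) of how the custom
# counters above could be read back after a run on a runner that supports
# metric queries, such as the DirectRunner. MetricsFilter and
# PipelineResult.metrics() are real Beam APIs; the `pipeline` name assumes
# you built the pipeline without the `with` context manager so you can keep a
# handle to the PipelineResult returned by Pipeline.run():
#
#   from apache_beam.metrics.metric import MetricsFilter
#
#   result = pipeline.run()
#   result.wait_until_finish()
#   query = result.metrics().query(MetricsFilter().with_name('matched_words'))
#   for counter in query['counters']:
#     logging.info('%s: %d', counter.key.metric.name, counter.committed)
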
class CountWords(beam.PTransform):
  """A transform to count the occurrences of each word.

  A PTransform that converts a PCollection containing lines of text into a
  PCollection of (word, count) tuples.
  """
  def expand(self, pcoll):
    def count_ones(word_ones):
      (word, ones) = word_ones
      return (word, sum(ones))

    return (
        pcoll
        | 'split' >> (
            beam.FlatMap(
                lambda x: re.findall(r'[A-Za-z\']+', x)).with_output_types(str))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))


def run(argv=None, save_main_session=True):
  """Runs the debugging wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection, count the occurrences
    # of each word and filter by a list of words.
    filtered_words = (
        p | 'read' >> ReadFromText(known_args.input)
        | CountWords()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # assert_that is a convenient PTransform that checks that a PCollection
    # has the expected contents. Asserts are best used in unit tests with
    # small data sets, but they are demonstrated here as a teaching tool.
    #
    # Note that assert_that produces no output; successful completion of the
    # pipeline implies that the expectations were met. Learn more about how
    # to best test your pipeline at
    # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline.
    assert_that(filtered_words, equal_to([('Flourish', 3), ('stomach', 1)]))

    # Format the counts into a PCollection of strings and write the output
    # using a "Write" transform that has side effects.
    # pylint: disable=unused-variable
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %s' % (word, count)

    output = (
        filtered_words
        | 'format' >> beam.Map(format_result)
        | 'write' >> WriteToText(known_args.output))
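
# A minimal sketch (not part of the original example) of how the transforms
# above could be exercised in a unit test with a small in-memory data set,
# which is where assert_that is most useful. TestPipeline, beam.Create,
# assert_that and equal_to are real Beam testing APIs; the sample lines are
# made up for illustration:
#
#   from apache_beam.testing.test_pipeline import TestPipeline
#
#   def test_count_words_filtered():
#     with TestPipeline() as p:
#       output = (
#           p
#           | beam.Create(['Flourish Flourish stomach', 'Flourish'])
#           | CountWords()
#           | beam.ParDo(FilterTextFn('Flourish|stomach')))
#       assert_that(output, equal_to([('Flourish', 3), ('stomach', 1)]))
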
if __name__ == '__main__':
  # Cloud Logging only receives logs at the logging.INFO level and higher, as
  # emitted through the root logger; those log statements will be visible in
  # the Cloud Logging UI. Learn more about Cloud Logging at
  # https://cloud.google.com/logging.
  #
  # You can set the default logging level to a different level when running
  # locally.
  logging.getLogger().setLevel(logging.INFO)
  run()
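
# For example (an illustrative variation, not part of the original example):
# to also see the "Did not match" DEBUG messages from FilterTextFn when
# running locally with the DirectRunner, lower the root logger threshold
# before calling run():
#
#   logging.getLogger().setLevel(logging.DEBUG)
#   run()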