github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_debugging.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """An example that verifies the counts and includes best practices.
    19  
    20  On top of the basic concepts in the wordcount example, this workflow introduces
    21  logging to Cloud Logging, and using assertions in a Dataflow pipeline.
    22  
    23  To execute this pipeline locally, specify a local output file or output prefix
    24  on GCS::
    25  
    26    --output [YOUR_LOCAL_FILE | gs://YOUR_OUTPUT_PREFIX]
    27  
    28  To execute this pipeline using the Google Cloud Dataflow service, specify
    29  pipeline configuration::
    30  
    31    --project YOUR_PROJECT_ID
    32    --staging_location gs://YOUR_STAGING_DIRECTORY
    33    --temp_location gs://YOUR_TEMP_DIRECTORY
    34    --region GCE_REGION
    35    --job_name YOUR_JOB_NAME
    36    --runner DataflowRunner
    37  
    38  and an output prefix on GCS::
    39  
    40    --output gs://YOUR_OUTPUT_PREFIX
    41  """

# pytype: skip-file

# beam-playground:
#   name: WordCountDebugging
#   description: An example that counts words in Shakespeare's works.
#     Includes a regex filter ("Flourish|stomach").
#   multifile: false
#   pipeline_options: --output output.txt
#   context_line: 74
#   categories:
#     - Flatten
#     - Debugging
#     - Options
#     - Combiners
#     - Filtering
#     - Quickstart
#   complexity: ADVANCED
#   tags:
#     - count
#     - debug
#     - string
import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


class FilterTextFn(beam.DoFn):
  """A DoFn that filters for a specific key based on a regular expression."""
  def __init__(self, pattern):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
    self.pattern = pattern
    # A custom metric can track values in your pipeline as it runs. Those
    # values will be available in the monitoring system of the runner used
    # to run the pipeline. The metrics below track the number of matched
    # and unmatched words.
    self.matched_words = Metrics.counter(self.__class__, 'matched_words')
    self.unmatched_words = Metrics.counter(self.__class__, 'unmatched_words')

  def process(self, element):
    word, _ = element
    if re.match(self.pattern, word):
      # Log at INFO level each element we match. When executing this pipeline
      # using the Dataflow service, these log lines will appear in the Cloud
      # Logging UI.
      logging.info('Matched %s', word)
      self.matched_words.inc()
      yield element
    else:
      # Log at DEBUG level each element that is not matched. Different log
      # levels can be used to control the verbosity of logging, providing an
      # effective mechanism to filter less important information. Note that
      # currently only logs at INFO level and higher are emitted to Cloud
      # Logging, so this message will not be visible there.
      logging.debug('Did not match %s', word)
      self.unmatched_words.inc()

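# A minimal sketch (not part of the original example) of how the counters
# above could be inspected once a pipeline finishes. It assumes a
# PipelineResult such as the one returned by beam.Pipeline.run(), and queries
# the metrics by name; the helper name is illustrative.
def print_filter_counters(result):
  """Logs the committed values of the matched/unmatched word counters."""
  from apache_beam.metrics.metric import MetricsFilter
  for name in ('matched_words', 'unmatched_words'):
    for counter in result.metrics().query(
        MetricsFilter().with_name(name))['counters']:
      logging.info('%s: %d', name, counter.committed)
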
class CountWords(beam.PTransform):
  """A transform to count the occurrences of each word.

  A PTransform that converts a PCollection containing lines of text into a
  PCollection of (word, count) tuples.
  """
  def expand(self, pcoll):
    def count_ones(word_ones):
      (word, ones) = word_ones
      return (word, sum(ones))

    return (
        pcoll
        | 'split' >> (
            beam.FlatMap(
                lambda x: re.findall(r'[A-Za-z\']+', x)).with_output_types(str))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))


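# A minimal unit-test sketch (not part of the original example) showing where
# assert_that is most at home: exercising CountWords on a tiny in-memory
# input inside a TestPipeline. The input line and expected counts are
# illustrative.
def run_count_words_test():
  from apache_beam.testing.test_pipeline import TestPipeline
  with TestPipeline() as p:
    counts = p | beam.Create(['a b a']) | CountWords()
    assert_that(counts, equal_to([('a', 2), ('b', 1)]))
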
def run(argv=None, save_main_session=True):
  """Runs the debugging wordcount pipeline."""

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection, count the occurrences
    # of each word, and filter by a list of words.
    filtered_words = (
        p | 'read' >> ReadFromText(known_args.input)
        | CountWords()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # assert_that is a convenient PTransform that checks that a PCollection
    # has an expected value. Asserts are best used in unit tests with small
    # data sets (see the run_count_words_test sketch above), but one is
    # demonstrated here as a teaching tool.
    #
    # Note that assert_that does not produce any output, and that successful
    # completion of the pipeline implies that the expectations were met.
    # Learn more about how to best test your pipeline at
    # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline.
    assert_that(filtered_words, equal_to([('Flourish', 3), ('stomach', 1)]))

    # Format the counts into a PCollection of strings and write the output
    # using a "Write" transform that has side effects.
    # pylint: disable=unused-variable
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %s' % (word, count)

    output = (
        filtered_words
        | 'format' >> beam.Map(format_result)
        | 'write' >> WriteToText(known_args.output))


if __name__ == '__main__':
  # Cloud Logging contains only logs at INFO level and higher emitted by the
  # root logger; all such log statements will be visible in the Cloud Logging
  # UI. Learn more about the Cloud Logging UI at
  # https://cloud.google.com/logging.
  #
  # You can set the default logging level to a different level when running
  # locally.
  logging.getLogger().setLevel(logging.INFO)
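  # For example, to also see the per-element "Did not match" messages from
  # FilterTextFn when running locally, lower the threshold instead
  # (illustrative):
  #
  #   logging.getLogger().setLevel(logging.DEBUG)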
  run()