github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/snippets/snippets.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Code snippets used in webdocs.

The examples here are written specifically to read well with the accompanying
web docs. Do not rewrite them until you make sure the webdocs still read well
and the rewritten code supports the concept being described. For example, there
are snippets that could be shorter but they are written like this to make a
specific point in the docs.

The code snippets are all organized as self contained functions. Parts of the
function body delimited by [START tag] and [END tag] will be included
automatically in the web docs. The naming convention for the tags is to have as
prefix the PATH_TO_HTML where they are included followed by a descriptive
string. The tags can contain only letters, digits and _.
"""
# pytype: skip-file

import argparse
import base64
import json
from decimal import Decimal

import mock

import apache_beam as beam
from apache_beam.io import iobase
from apache_beam.io.range_trackers import OffsetRangeTracker
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.transforms.core import PTransform

# Protect against environments where the Google Cloud Natural Language client
# is not available.
try:
  from apache_beam.ml.gcp import naturallanguageml as nlp
except ImportError:
  nlp = None

# Quiet some pylint warnings that happen because of the somewhat special
# format for the code snippets.
# pylint:disable=invalid-name
# pylint:disable=expression-not-assigned
# pylint:disable=redefined-outer-name
# pylint:disable=reimported
# pylint:disable=unused-variable
# pylint:disable=wrong-import-order, wrong-import-position


class SnippetUtils(object):
  from apache_beam.pipeline import PipelineVisitor

  class RenameFiles(PipelineVisitor):
    """RenameFiles will rewire read/write paths for unit testing.

    RenameFiles will replace the GCS files specified in the read and
    write transforms with local files so the pipeline can be run as a
    unit test. This assumes that read and write transforms defined in snippets
    have already been replaced by the transforms 'DummyReadForTesting' and
    'DummyWriteForTesting' (see snippets_test.py).

    This is as close as we can get to having code snippets that are
    executed and are also ready to be presented in webdocs.
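
    For example (the local paths are illustrative):

      renames = {'read': '/tmp/input.txt', 'write': '/tmp/output.txt'}
      pipeline.visit(SnippetUtils.RenameFiles(renames))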
82 """ 83 def __init__(self, renames): 84 self.renames = renames 85 86 def visit_transform(self, transform_node): 87 if transform_node.full_label.find('DummyReadForTesting') >= 0: 88 transform_node.transform.fn.file_to_read = self.renames['read'] 89 elif transform_node.full_label.find('DummyWriteForTesting') >= 0: 90 transform_node.transform.fn.file_to_write = self.renames['write'] 91 92 93 @mock.patch('apache_beam.Pipeline', TestPipeline) 94 def construct_pipeline(renames): 95 """A reverse words snippet as an example for constructing a pipeline.""" 96 import re 97 98 # This is duplicate of the import statement in 99 # pipelines_constructing_creating tag below, but required to avoid 100 # Unresolved reference in ReverseWords class 101 import apache_beam as beam 102 103 @beam.ptransform_fn 104 @beam.typehints.with_input_types(str) 105 @beam.typehints.with_output_types(str) 106 def ReverseWords(pcoll): 107 """A PTransform that reverses individual elements in a PCollection.""" 108 return pcoll | beam.Map(lambda word: word[::-1]) 109 110 def filter_words(unused_x): 111 """Pass through filter to select everything.""" 112 return True 113 114 # [START pipelines_constructing_creating] 115 import apache_beam as beam 116 117 with beam.Pipeline() as pipeline: 118 pass # build your pipeline here 119 # [END pipelines_constructing_creating] 120 121 # [START pipelines_constructing_reading] 122 lines = pipeline | 'ReadMyFile' >> beam.io.ReadFromText( 123 'gs://some/inputData.txt') 124 # [END pipelines_constructing_reading] 125 126 # [START pipelines_constructing_applying] 127 words = lines | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 128 reversed_words = words | ReverseWords() 129 # [END pipelines_constructing_applying] 130 131 # [START pipelines_constructing_writing] 132 filtered_words = reversed_words | 'FilterWords' >> beam.Filter(filter_words) 133 filtered_words | 'WriteMyFile' >> beam.io.WriteToText( 134 'gs://some/outputData.txt') 135 # [END pipelines_constructing_writing] 136 137 pipeline.visit(SnippetUtils.RenameFiles(renames)) 138 139 140 def model_pipelines(): 141 """A wordcount snippet as a simple pipeline example.""" 142 # [START model_pipelines] 143 import argparse 144 import re 145 146 import apache_beam as beam 147 from apache_beam.options.pipeline_options import PipelineOptions 148 149 parser = argparse.ArgumentParser() 150 parser.add_argument( 151 '--input-file', 152 default='gs://dataflow-samples/shakespeare/kinglear.txt', 153 help='The file path for the input text to process.') 154 parser.add_argument( 155 '--output-path', required=True, help='The path prefix for output files.') 156 args, beam_args = parser.parse_known_args() 157 158 beam_options = PipelineOptions(beam_args) 159 with beam.Pipeline(options=beam_options) as pipeline: 160 ( 161 pipeline 162 | beam.io.ReadFromText(args.input_file) 163 | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 164 | beam.Map(lambda x: (x, 1)) 165 | beam.combiners.Count.PerKey() 166 | beam.io.WriteToText(args.output_path)) 167 # [END model_pipelines] 168 169 170 def model_pcollection(output_path): 171 """Creating a PCollection from data in local memory.""" 172 # [START model_pcollection] 173 import apache_beam as beam 174 175 with beam.Pipeline() as pipeline: 176 lines = ( 177 pipeline 178 | beam.Create([ 179 'To be, or not to be: that is the question: ', 180 "Whether 'tis nobler in the mind to suffer ", 181 'The slings and arrows of outrageous fortune, ', 182 'Or to take arms against a sea of troubles, ', 183 ])) 184 # [END 
model_pcollection] 185 186 lines | beam.io.WriteToText(output_path) 187 188 189 def pipeline_options_remote(): 190 """Creating a Pipeline using a PipelineOptions object for remote execution.""" 191 192 # [START pipeline_options_create] 193 from apache_beam.options.pipeline_options import PipelineOptions 194 195 beam_options = PipelineOptions() 196 # [END pipeline_options_create] 197 198 # [START pipeline_options_define_custom] 199 from apache_beam.options.pipeline_options import PipelineOptions 200 201 class MyOptions(PipelineOptions): 202 @classmethod 203 def _add_argparse_args(cls, parser): 204 parser.add_argument('--input') 205 parser.add_argument('--output') 206 207 # [END pipeline_options_define_custom] 208 209 @mock.patch('apache_beam.Pipeline') 210 def dataflow_options(mock_pipeline): 211 # [START pipeline_options_dataflow_service] 212 import argparse 213 214 import apache_beam as beam 215 from apache_beam.options.pipeline_options import PipelineOptions 216 217 parser = argparse.ArgumentParser() 218 # parser.add_argument('--my-arg', help='description') 219 args, beam_args = parser.parse_known_args() 220 221 # Create and set your PipelineOptions. 222 # For Cloud execution, specify DataflowRunner and set the Cloud Platform 223 # project, job name, temporary files location, and region. 224 # For more information about regions, check: 225 # https://cloud.google.com/dataflow/docs/concepts/regional-endpoints 226 beam_options = PipelineOptions( 227 beam_args, 228 runner='DataflowRunner', 229 project='my-project-id', 230 job_name='unique-job-name', 231 temp_location='gs://my-bucket/temp', 232 region='us-central1') 233 # Note: Repeatable options like dataflow_service_options or experiments must 234 # be specified as a list of string(s). 235 # e.g. dataflow_service_options=['enable_prime'] 236 237 # Create the Pipeline with the specified options. 238 with beam.Pipeline(options=beam_options) as pipeline: 239 pass # build your pipeline here. 240 # [END pipeline_options_dataflow_service] 241 return beam_options 242 243 beam_options = dataflow_options() 244 args = beam_options.view_as(MyOptions) 245 246 with TestPipeline() as pipeline: # Use TestPipeline for testing. 247 lines = pipeline | beam.io.ReadFromText(args.input) 248 lines | beam.io.WriteToText(args.output) 249 250 251 @mock.patch('apache_beam.Pipeline', TestPipeline) 252 def pipeline_options_local(): 253 """Creating a Pipeline using a PipelineOptions object for local execution.""" 254 255 # [START pipeline_options_define_custom_with_help_and_default] 256 from apache_beam.options.pipeline_options import PipelineOptions 257 258 class MyOptions(PipelineOptions): 259 @classmethod 260 def _add_argparse_args(cls, parser): 261 parser.add_argument( 262 '--input', 263 default='gs://dataflow-samples/shakespeare/kinglear.txt', 264 help='The file path for the input text to process.') 265 parser.add_argument( 266 '--output', required=True, help='The path prefix for output files.') 267 268 # [END pipeline_options_define_custom_with_help_and_default] 269 270 # [START pipeline_options_local] 271 import argparse 272 273 import apache_beam as beam 274 from apache_beam.options.pipeline_options import PipelineOptions 275 276 parser = argparse.ArgumentParser() 277 # parser.add_argument('--my-arg') 278 args, beam_args = parser.parse_known_args() 279 280 # Create and set your Pipeline Options. 
281 beam_options = PipelineOptions(beam_args) 282 args = beam_options.view_as(MyOptions) 283 284 with beam.Pipeline(options=beam_options) as pipeline: 285 lines = ( 286 pipeline 287 | beam.io.ReadFromText(args.input) 288 | beam.io.WriteToText(args.output)) 289 # [END pipeline_options_local] 290 291 292 @mock.patch('apache_beam.Pipeline', TestPipeline) 293 def pipeline_options_command_line(): 294 """Creating a Pipeline by passing a list of arguments.""" 295 296 # [START pipeline_options_command_line] 297 # Use Python argparse module to parse custom arguments 298 import argparse 299 300 import apache_beam as beam 301 from apache_beam.options.pipeline_options import PipelineOptions 302 303 # For more details on how to use argparse, take a look at: 304 # https://docs.python.org/3/library/argparse.html 305 parser = argparse.ArgumentParser() 306 parser.add_argument( 307 '--input-file', 308 default='gs://dataflow-samples/shakespeare/kinglear.txt', 309 help='The file path for the input text to process.') 310 parser.add_argument( 311 '--output-path', required=True, help='The path prefix for output files.') 312 args, beam_args = parser.parse_known_args() 313 314 # Create the Pipeline with remaining arguments. 315 beam_options = PipelineOptions(beam_args) 316 with beam.Pipeline(options=beam_options) as pipeline: 317 lines = ( 318 pipeline 319 | 'Read files' >> beam.io.ReadFromText(args.input_file) 320 | 'Write files' >> beam.io.WriteToText(args.output_path)) 321 # [END pipeline_options_command_line] 322 323 324 def pipeline_logging(lines, output): 325 """Logging Pipeline Messages.""" 326 327 import re 328 import apache_beam as beam 329 330 # [START pipeline_logging] 331 # import Python logging module. 332 import logging 333 334 class ExtractWordsFn(beam.DoFn): 335 def process(self, element): 336 words = re.findall(r'[A-Za-z\']+', element) 337 for word in words: 338 yield word 339 340 if word.lower() == 'love': 341 # Log using the root logger at info or higher levels 342 logging.info('Found : %s', word.lower()) 343 344 # Remaining WordCount example code ... 345 # [END pipeline_logging] 346 347 with TestPipeline() as pipeline: # Use TestPipeline for testing. 348 ( 349 pipeline 350 | beam.Create(lines) 351 | beam.ParDo(ExtractWordsFn()) 352 | beam.io.WriteToText(output)) 353 354 355 def pipeline_monitoring(): 356 """Using monitoring interface snippets.""" 357 358 import argparse 359 import re 360 import apache_beam as beam 361 362 class ExtractWordsFn(beam.DoFn): 363 def process(self, element): 364 words = re.findall(r'[A-Za-z\']+', element) 365 for word in words: 366 yield word 367 368 class FormatCountsFn(beam.DoFn): 369 def process(self, element): 370 word, count = element 371 yield '%s: %s' % (word, count) 372 373 # [START pipeline_monitoring_composite] 374 # The CountWords Composite Transform inside the WordCount pipeline. 375 @beam.ptransform_fn 376 def CountWords(pcoll): 377 return ( 378 pcoll 379 # Convert lines of text into individual words. 380 | 'ExtractWords' >> beam.ParDo(ExtractWordsFn()) 381 # Count the number of times each word occurs. 382 | beam.combiners.Count.PerElement() 383 # Format each word and count into a printable string. 
384 | 'FormatCounts' >> beam.ParDo(FormatCountsFn())) 385 386 # [END pipeline_monitoring_composite] 387 388 parser = argparse.ArgumentParser() 389 parser.add_argument( 390 '--input-file', 391 default='gs://dataflow-samples/shakespeare/kinglear.txt', 392 help='The file path for the input text to process.') 393 parser.add_argument( 394 '--output-path', required=True, help='The path prefix for output files.') 395 args, _ = parser.parse_known_args() 396 397 with TestPipeline() as pipeline: # Use TestPipeline for testing. 398 399 # [START pipeline_monitoring_execution] 400 ( 401 pipeline 402 # Read the lines of the input text. 403 | 'ReadLines' >> beam.io.ReadFromText(args.input_file) 404 # Count the words. 405 | CountWords() 406 # Write the formatted word counts to output. 407 | 'WriteCounts' >> beam.io.WriteToText(args.output_path)) 408 # [END pipeline_monitoring_execution] 409 410 411 def examples_wordcount_minimal(): 412 """MinimalWordCount example snippets.""" 413 import re 414 415 import apache_beam as beam 416 417 # [START examples_wordcount_minimal_options] 418 from apache_beam.options.pipeline_options import PipelineOptions 419 420 input_file = 'gs://dataflow-samples/shakespeare/kinglear.txt' 421 output_path = 'gs://my-bucket/counts.txt' 422 423 beam_options = PipelineOptions( 424 runner='DataflowRunner', 425 project='my-project-id', 426 job_name='unique-job-name', 427 temp_location='gs://my-bucket/temp', 428 ) 429 # [END examples_wordcount_minimal_options] 430 431 # Run it locally for testing. 432 import argparse 433 434 parser = argparse.ArgumentParser() 435 parser.add_argument('--input-file') 436 parser.add_argument('--output-path') 437 args, beam_args = parser.parse_known_args() 438 439 input_file = args.input_file 440 output_path = args.output_path 441 442 beam_options = PipelineOptions(beam_args) 443 444 # [START examples_wordcount_minimal_create] 445 pipeline = beam.Pipeline(options=beam_options) 446 # [END examples_wordcount_minimal_create] 447 448 ( 449 # [START examples_wordcount_minimal_read] 450 pipeline 451 | beam.io.ReadFromText(input_file) 452 # [END examples_wordcount_minimal_read] 453 454 # [START examples_wordcount_minimal_pardo] 455 | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 456 # [END examples_wordcount_minimal_pardo] 457 458 # [START examples_wordcount_minimal_count] 459 | beam.combiners.Count.PerElement() 460 # [END examples_wordcount_minimal_count] 461 462 # [START examples_wordcount_minimal_map] 463 | beam.MapTuple(lambda word, count: '%s: %s' % (word, count)) 464 # [END examples_wordcount_minimal_map] 465 466 # [START examples_wordcount_minimal_write] 467 | beam.io.WriteToText(output_path) 468 # [END examples_wordcount_minimal_write] 469 ) 470 471 # [START examples_wordcount_minimal_run] 472 result = pipeline.run() 473 # [END examples_wordcount_minimal_run] 474 result.wait_until_finish() 475 476 477 def examples_wordcount_wordcount(): 478 """WordCount example snippets.""" 479 import re 480 481 import apache_beam as beam 482 from apache_beam.options.pipeline_options import PipelineOptions 483 484 # [START examples_wordcount_wordcount_options] 485 import argparse 486 487 parser = argparse.ArgumentParser() 488 parser.add_argument( 489 '--input-file', 490 default='gs://dataflow-samples/shakespeare/kinglear.txt', 491 help='The file path for the input text to process.') 492 parser.add_argument( 493 '--output-path', required=True, help='The path prefix for output files.') 494 args, beam_args = parser.parse_known_args() 495 496 
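  # Note: parse_known_args() returns the arguments declared above in `args` and
  # leaves everything it does not recognize (for example --runner or --project)
  # in `beam_args`, which is forwarded to PipelineOptions below.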
beam_options = PipelineOptions(beam_args) 497 with beam.Pipeline(options=beam_options) as pipeline: 498 lines = pipeline | beam.io.ReadFromText(args.input_file) 499 500 # [END examples_wordcount_wordcount_options] 501 502 # [START examples_wordcount_wordcount_composite] 503 @beam.ptransform_fn 504 def CountWords(pcoll): 505 return ( 506 pcoll 507 # Convert lines of text into individual words. 508 | 'ExtractWords' >> 509 beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 510 511 # Count the number of times each word occurs. 512 | beam.combiners.Count.PerElement()) 513 514 counts = lines | CountWords() 515 516 # [END examples_wordcount_wordcount_composite] 517 518 # [START examples_wordcount_wordcount_dofn] 519 class FormatAsTextFn(beam.DoFn): 520 def process(self, element): 521 word, count = element 522 yield '%s: %s' % (word, count) 523 524 formatted = counts | beam.ParDo(FormatAsTextFn()) 525 # [END examples_wordcount_wordcount_dofn] 526 527 formatted | beam.io.WriteToText(args.output_path) 528 529 530 def examples_wordcount_templated(): 531 """Templated WordCount example snippet.""" 532 import re 533 534 import apache_beam as beam 535 from apache_beam.io import ReadFromText 536 from apache_beam.io import WriteToText 537 from apache_beam.options.pipeline_options import PipelineOptions 538 539 # [START example_wordcount_templated] 540 class WordcountTemplatedOptions(PipelineOptions): 541 @classmethod 542 def _add_argparse_args(cls, parser): 543 # Use add_value_provider_argument for arguments to be templatable 544 # Use add_argument as usual for non-templatable arguments 545 parser.add_value_provider_argument( 546 '--input-file', 547 default='gs://dataflow-samples/shakespeare/kinglear.txt', 548 help='The file path for the input text to process.') 549 parser.add_argument( 550 '--output-path', 551 required=True, 552 help='The path prefix for output files.') 553 554 beam_options = PipelineOptions() 555 args = beam_options.view_as(WordcountTemplatedOptions) 556 557 with beam.Pipeline(options=beam_options) as pipeline: 558 lines = pipeline | 'Read' >> ReadFromText(args.input_file.get()) 559 560 # [END example_wordcount_templated] 561 562 def format_result(word_count): 563 (word, count) = word_count 564 return '%s: %s' % (word, count) 565 566 ( 567 lines 568 | 569 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 570 | 'PairWithOnes' >> beam.Map(lambda x: (x, 1)) 571 | 'Group' >> beam.GroupByKey() 572 | 573 'Sum' >> beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))) 574 | 'Format' >> beam.Map(format_result) 575 | 'Write' >> WriteToText(args.output_path)) 576 577 578 def examples_wordcount_debugging(renames): 579 """DebuggingWordCount example snippets.""" 580 import re 581 582 import apache_beam as beam 583 584 # [START example_wordcount_debugging_logging] 585 # [START example_wordcount_debugging_aggregators] 586 import logging 587 588 class FilterTextFn(beam.DoFn): 589 """A DoFn that filters for a specific key based on a regular expression.""" 590 def __init__(self, pattern): 591 self.pattern = pattern 592 # A custom metric can track values in your pipeline as it runs. Create 593 # custom metrics matched_word and unmatched_words. 594 self.matched_words = Metrics.counter(self.__class__, 'matched_words') 595 self.umatched_words = Metrics.counter(self.__class__, 'umatched_words') 596 597 def process(self, element): 598 word, _ = element 599 if re.match(self.pattern, word): 600 # Log at INFO level each element we match. 
When executing this pipeline 601 # using the Dataflow service, these log lines will appear in the Cloud 602 # Logging UI. 603 logging.info('Matched %s', word) 604 605 # Add 1 to the custom metric counter matched_words 606 self.matched_words.inc() 607 yield element 608 else: 609 # Log at the "DEBUG" level each element that is not matched. Different 610 # log levels can be used to control the verbosity of logging providing 611 # an effective mechanism to filter less important information. Note 612 # currently only "INFO" and higher level logs are emitted to the Cloud 613 # Logger. This log message will not be visible in the Cloud Logger. 614 logging.debug('Did not match %s', word) 615 616 # Add 1 to the custom metric counter umatched_words 617 self.umatched_words.inc() 618 619 # [END example_wordcount_debugging_logging] 620 # [END example_wordcount_debugging_aggregators] 621 622 with TestPipeline() as pipeline: # Use TestPipeline for testing. 623 filtered_words = ( 624 pipeline 625 | 626 beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt') 627 | 628 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) 629 | beam.combiners.Count.PerElement() 630 | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach'))) 631 632 # [START example_wordcount_debugging_assert] 633 beam.testing.util.assert_that( 634 filtered_words, 635 beam.testing.util.equal_to([('Flourish', 3), ('stomach', 1)])) 636 637 # [END example_wordcount_debugging_assert] 638 639 def format_result(word_count): 640 (word, count) = word_count 641 return '%s: %s' % (word, count) 642 643 output = ( 644 filtered_words 645 | 'format' >> beam.Map(format_result) 646 | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt')) 647 648 pipeline.visit(SnippetUtils.RenameFiles(renames)) 649 650 651 def examples_wordcount_streaming(): 652 import apache_beam as beam 653 from apache_beam import window 654 from apache_beam.options.pipeline_options import PipelineOptions 655 656 # Parse out arguments. 657 parser = argparse.ArgumentParser() 658 parser.add_argument( 659 '--output_topic', 660 required=True, 661 help=( 662 'Output PubSub topic of the form ' 663 '"projects/<PROJECT>/topic/<TOPIC>".')) 664 group = parser.add_mutually_exclusive_group(required=True) 665 group.add_argument( 666 '--input_topic', 667 help=( 668 'Input PubSub topic of the form ' 669 '"projects/<PROJECT>/topics/<TOPIC>".')) 670 group.add_argument( 671 '--input_subscription', 672 help=( 673 'Input PubSub subscription of the form ' 674 '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) 675 args, beam_args = parser.parse_known_args() 676 677 beam_options = PipelineOptions(beam_args, streaming=True) 678 679 with TestPipeline(options=beam_options) as pipeline: 680 # [START example_wordcount_streaming_read] 681 # Read from Pub/Sub into a PCollection. 
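    # Note: by default ReadFromPubSub yields message payloads as bytes, which
    # is why the 'DecodeUnicode' step below decodes them to str.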
682 if args.input_subscription: 683 lines = pipeline | beam.io.ReadFromPubSub( 684 subscription=args.input_subscription) 685 else: 686 lines = pipeline | beam.io.ReadFromPubSub(topic=args.input_topic) 687 # [END example_wordcount_streaming_read] 688 689 output = ( 690 lines 691 | 'DecodeUnicode' >> beam.Map(lambda encoded: encoded.decode('utf-8')) 692 | 'ExtractWords' >> 693 beam.FlatMap(lambda x: __import__('re').findall(r'[A-Za-z\']+', x)) 694 | 'PairWithOnes' >> beam.Map(lambda x: (x, 1)) 695 | beam.WindowInto(window.FixedWindows(15, 0)) 696 | 'Group' >> beam.GroupByKey() 697 | 698 'Sum' >> beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))) 699 | 'Format' >> 700 beam.MapTuple(lambda word, count: f'{word}: {count}'.encode('utf-8'))) 701 702 # [START example_wordcount_streaming_write] 703 # Write to Pub/Sub 704 output | beam.io.WriteToPubSub(args.output_topic) 705 # [END example_wordcount_streaming_write] 706 707 708 def examples_ptransforms_templated(renames): 709 # [START examples_ptransforms_templated] 710 import apache_beam as beam 711 from apache_beam.io import WriteToText 712 from apache_beam.options.pipeline_options import PipelineOptions 713 from apache_beam.options.value_provider import StaticValueProvider 714 715 class TemplatedUserOptions(PipelineOptions): 716 @classmethod 717 def _add_argparse_args(cls, parser): 718 parser.add_value_provider_argument('--templated_int', type=int) 719 720 class MySumFn(beam.DoFn): 721 def __init__(self, templated_int): 722 self.templated_int = templated_int 723 724 def process(self, an_int): 725 yield self.templated_int.get() + an_int 726 727 beam_options = PipelineOptions() 728 args = beam_options.view_as(TemplatedUserOptions) 729 730 with beam.Pipeline(options=beam_options) as pipeline: 731 my_sum_fn = MySumFn(args.templated_int) 732 sum = ( 733 pipeline 734 | 'ReadCollection' >> 735 beam.io.ReadFromText('gs://some/integer_collection') 736 | 'StringToInt' >> beam.Map(lambda w: int(w)) 737 | 'AddGivenInt' >> beam.ParDo(my_sum_fn) 738 | 'WriteResultingCollection' >> WriteToText('some/output_path')) 739 # [END examples_ptransforms_templated] 740 741 # Templates are not supported by DirectRunner (only by DataflowRunner) 742 # so a value must be provided at graph-construction time 743 my_sum_fn.templated_int = StaticValueProvider(int, 10) 744 745 pipeline.visit(SnippetUtils.RenameFiles(renames)) 746 747 748 # Defining a new source. 
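# A bounded source implements four methods, all of which CountingSource below
# exercises: estimate_size (an estimate of the total output size),
# get_range_tracker (backed here by OffsetRangeTracker), read (which must
# try_claim each position before emitting it), and split (which carves the
# range into SourceBundles of roughly desired_bundle_size).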
749 # [START model_custom_source_new_source] 750 class CountingSource(iobase.BoundedSource): 751 def __init__(self, count): 752 self.records_read = Metrics.counter(self.__class__, 'recordsRead') 753 self._count = count 754 755 def estimate_size(self): 756 return self._count 757 758 def get_range_tracker(self, start_position, stop_position): 759 if start_position is None: 760 start_position = 0 761 if stop_position is None: 762 stop_position = self._count 763 764 return OffsetRangeTracker(start_position, stop_position) 765 766 def read(self, range_tracker): 767 for i in range(range_tracker.start_position(), 768 range_tracker.stop_position()): 769 if not range_tracker.try_claim(i): 770 return 771 self.records_read.inc() 772 yield i 773 774 def split(self, desired_bundle_size, start_position=None, stop_position=None): 775 if start_position is None: 776 start_position = 0 777 if stop_position is None: 778 stop_position = self._count 779 780 bundle_start = start_position 781 while bundle_start < stop_position: 782 bundle_stop = min(stop_position, bundle_start + desired_bundle_size) 783 yield iobase.SourceBundle( 784 weight=(bundle_stop - bundle_start), 785 source=self, 786 start_position=bundle_start, 787 stop_position=bundle_stop) 788 bundle_start = bundle_stop 789 790 791 # [END model_custom_source_new_source] 792 793 794 # We recommend users to start Source classes with an underscore to discourage 795 # using the Source class directly when a PTransform for the source is 796 # available. We simulate that here by simply extending the previous Source 797 # class. 798 class _CountingSource(CountingSource): 799 pass 800 801 802 # [START model_custom_source_new_ptransform] 803 class ReadFromCountingSource(PTransform): 804 def __init__(self, count): 805 super().__init__() 806 self._count = count 807 808 def expand(self, pcoll): 809 return pcoll | iobase.Read(_CountingSource(self._count)) 810 811 812 # [END model_custom_source_new_ptransform] 813 814 815 def model_custom_source(count): 816 """Demonstrates creating a new custom source and using it in a pipeline. 817 818 Defines a new source ``CountingSource`` that produces integers starting from 0 819 up to a given size. 820 821 Uses the new source in an example pipeline. 822 823 Additionally demonstrates how a source should be implemented using a 824 ``PTransform``. This is the recommended way to develop sources that are to 825 distributed to a large number of end users. 826 827 This method runs two pipelines. 828 829 (1) A pipeline that uses ``CountingSource`` directly using the ``df.Read`` 830 transform. 831 (2) A pipeline that uses a custom ``PTransform`` that wraps 832 ``CountingSource``. 833 834 Args: 835 count: the size of the counting source to be used in the pipeline 836 demonstrated in this method. 837 838 """ 839 840 # Using the source in an example pipeline. 
841 # [START model_custom_source_use_new_source] 842 with beam.Pipeline() as pipeline: 843 numbers = pipeline | 'ProduceNumbers' >> beam.io.Read(CountingSource(count)) 844 # [END model_custom_source_use_new_source] 845 846 lines = numbers | beam.core.Map(lambda number: 'line %d' % number) 847 assert_that( 848 lines, equal_to(['line ' + str(number) for number in range(0, count)])) 849 850 # [START model_custom_source_use_ptransform] 851 with beam.Pipeline() as pipeline: 852 numbers = pipeline | 'ProduceNumbers' >> ReadFromCountingSource(count) 853 # [END model_custom_source_use_ptransform] 854 855 lines = numbers | beam.core.Map(lambda number: 'line %d' % number) 856 assert_that( 857 lines, equal_to(['line ' + str(number) for number in range(0, count)])) 858 859 860 # Defining the new sink. 861 # 862 # Defines a new sink ``SimpleKVSink`` that demonstrates writing to a simple 863 # key-value based storage system which has following API. 864 # 865 # simplekv.connect(url) - 866 # connects to the storage system and returns an access token which can be 867 # used to perform further operations 868 # simplekv.open_table(access_token, table_name) - 869 # creates a table named 'table_name'. Returns a table object. 870 # simplekv.write_to_table(access_token, table, key, value) - 871 # writes a key-value pair to the given table. 872 # simplekv.rename_table(access_token, old_name, new_name) - 873 # renames the table named 'old_name' to 'new_name'. 874 # 875 # [START model_custom_sink_new_sink] 876 class SimpleKVSink(iobase.Sink): 877 def __init__(self, simplekv, url, final_table_name): 878 self._simplekv = simplekv 879 self._url = url 880 self._final_table_name = final_table_name 881 882 def initialize_write(self): 883 access_token = self._simplekv.connect(self._url) 884 return access_token 885 886 def open_writer(self, access_token, uid): 887 table_name = 'table' + uid 888 return SimpleKVWriter(self._simplekv, access_token, table_name) 889 890 def pre_finalize(self, init_result, writer_results): 891 pass 892 893 def finalize_write(self, access_token, table_names, pre_finalize_result): 894 for i, table_name in enumerate(table_names): 895 self._simplekv.rename_table( 896 access_token, table_name, self._final_table_name + str(i)) 897 898 899 # [END model_custom_sink_new_sink] 900 901 902 # Defining a writer for the new sink. 903 # [START model_custom_sink_new_writer] 904 class SimpleKVWriter(iobase.Writer): 905 def __init__(self, simplekv, access_token, table_name): 906 self._simplekv = simplekv 907 self._access_token = access_token 908 self._table_name = table_name 909 self._table = self._simplekv.open_table(access_token, table_name) 910 911 def write(self, record): 912 key, value = record 913 914 self._simplekv.write_to_table(self._access_token, self._table, key, value) 915 916 def close(self): 917 return self._table_name 918 919 920 # [END model_custom_sink_new_writer] 921 922 923 # [START model_custom_sink_new_ptransform] 924 class WriteToKVSink(PTransform): 925 def __init__(self, simplekv, url, final_table_name): 926 self._simplekv = simplekv 927 super().__init__() 928 self._url = url 929 self._final_table_name = final_table_name 930 931 def expand(self, pcoll): 932 return pcoll | iobase.Write( 933 _SimpleKVSink(self._simplekv, self._url, self._final_table_name)) 934 935 936 # [END model_custom_sink_new_ptransform] 937 938 939 # We recommend users to start Sink class names with an underscore to 940 # discourage using the Sink class directly when a PTransform for the sink is 941 # available. 
We simulate that here by simply extending the previous Sink 942 # class. 943 class _SimpleKVSink(SimpleKVSink): 944 pass 945 946 947 def model_custom_sink( 948 simplekv, 949 KVs, 950 final_table_name_no_ptransform, 951 final_table_name_with_ptransform): 952 """Demonstrates creating a new custom sink and using it in a pipeline. 953 954 Uses the new sink in an example pipeline. 955 956 Additionally demonstrates how a sink should be implemented using a 957 ``PTransform``. This is the recommended way to develop sinks that are to be 958 distributed to a large number of end users. 959 960 This method runs two pipelines. 961 962 (1) A pipeline that uses ``SimpleKVSink`` directly using the ``df.Write`` 963 transform. 964 (2) A pipeline that uses a custom ``PTransform`` that wraps 965 ``SimpleKVSink``. 966 967 Args: 968 simplekv: an object that mocks the key-value storage. 969 970 KVs: the set of key-value pairs to be written in the example pipeline. 971 972 final_table_name_no_ptransform: the prefix of final set of tables to be 973 created by the example pipeline that uses 974 ``SimpleKVSink`` directly. 975 976 final_table_name_with_ptransform: the prefix of final set of tables to be 977 created by the example pipeline that uses 978 a ``PTransform`` that wraps 979 ``SimpleKVSink``. 980 """ 981 982 final_table_name = final_table_name_no_ptransform 983 984 # Using the new sink in an example pipeline. 985 # [START model_custom_sink_use_new_sink] 986 with beam.Pipeline(options=PipelineOptions()) as pipeline: 987 kvs = pipeline | 'CreateKVs' >> beam.Create(KVs) 988 989 kvs | 'WriteToSimpleKV' >> beam.io.Write( 990 SimpleKVSink(simplekv, 'http://url_to_simple_kv/', final_table_name)) 991 # [END model_custom_sink_use_new_sink] 992 993 final_table_name = final_table_name_with_ptransform 994 995 # [START model_custom_sink_use_ptransform] 996 with beam.Pipeline(options=PipelineOptions()) as pipeline: 997 kvs = pipeline | 'CreateKVs' >> beam.core.Create(KVs) 998 kvs | 'WriteToSimpleKV' >> WriteToKVSink( 999 simplekv, 'http://url_to_simple_kv/', final_table_name) 1000 # [END model_custom_sink_use_ptransform] 1001 1002 1003 def model_textio(renames): 1004 """Using a Read and Write transform to read/write text files.""" 1005 def filter_words(x): 1006 import re 1007 return re.findall(r'[A-Za-z\']+', x) 1008 1009 # [START model_textio_read] 1010 with beam.Pipeline(options=PipelineOptions()) as pipeline: 1011 # [START model_pipelineio_read] 1012 lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText( 1013 'path/to/input-*.csv') 1014 # [END model_pipelineio_read] 1015 # [END model_textio_read] 1016 1017 # [START model_textio_write] 1018 filtered_words = lines | 'FilterWords' >> beam.FlatMap(filter_words) 1019 # [START model_pipelineio_write] 1020 filtered_words | 'WriteToText' >> beam.io.WriteToText( 1021 '/path/to/numbers', file_name_suffix='.csv') 1022 # [END model_pipelineio_write] 1023 # [END model_textio_write] 1024 1025 pipeline.visit(SnippetUtils.RenameFiles(renames)) 1026 1027 1028 def model_textio_compressed(renames, expected): 1029 """Using a Read Transform to read compressed text files.""" 1030 with TestPipeline() as pipeline: 1031 1032 # [START model_textio_write_compressed] 1033 lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText( 1034 '/path/to/input-*.csv.gz', 1035 compression_type=beam.io.filesystem.CompressionTypes.GZIP) 1036 # [END model_textio_write_compressed] 1037 1038 assert_that(lines, equal_to(expected)) 1039 pipeline.visit(SnippetUtils.RenameFiles(renames)) 1040 1041 1042 def 
model_datastoreio(): 1043 """Using a Read and Write transform to read/write to Cloud Datastore.""" 1044 1045 import uuid 1046 import apache_beam as beam 1047 from apache_beam.options.pipeline_options import PipelineOptions 1048 from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore 1049 from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore 1050 from apache_beam.io.gcp.datastore.v1new.types import Entity 1051 from apache_beam.io.gcp.datastore.v1new.types import Key 1052 from apache_beam.io.gcp.datastore.v1new.types import Query 1053 1054 project = 'my_project' 1055 kind = 'my_kind' 1056 query = Query(kind, project) 1057 1058 # [START model_datastoreio_read] 1059 pipeline = beam.Pipeline(options=PipelineOptions()) 1060 entities = pipeline | 'Read From Datastore' >> ReadFromDatastore(query) 1061 # [END model_datastoreio_read] 1062 1063 # [START model_datastoreio_write] 1064 pipeline = beam.Pipeline(options=PipelineOptions()) 1065 musicians = pipeline | 'Musicians' >> beam.Create( 1066 ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi']) 1067 1068 def to_entity(content): 1069 key = Key([kind, str(uuid.uuid4())]) 1070 entity = Entity(key) 1071 entity.set_properties({'content': content}) 1072 return entity 1073 1074 entities = musicians | 'To Entity' >> beam.Map(to_entity) 1075 entities | 'Write To Datastore' >> WriteToDatastore(project) 1076 # [END model_datastoreio_write] 1077 1078 1079 def model_bigqueryio( 1080 pipeline, write_project='', write_dataset='', write_table=''): 1081 """Using a Read and Write transform to read/write from/to BigQuery.""" 1082 1083 # [START model_bigqueryio_table_spec] 1084 # project-id:dataset_id.table_id 1085 table_spec = 'clouddataflow-readonly:samples.weather_stations' 1086 # [END model_bigqueryio_table_spec] 1087 1088 # [START model_bigqueryio_table_spec_without_project] 1089 # dataset_id.table_id 1090 table_spec = 'samples.weather_stations' 1091 # [END model_bigqueryio_table_spec_without_project] 1092 1093 # [START model_bigqueryio_table_spec_object] 1094 from apache_beam.io.gcp.internal.clients import bigquery 1095 1096 table_spec = bigquery.TableReference( 1097 projectId='clouddataflow-readonly', 1098 datasetId='samples', 1099 tableId='weather_stations') 1100 # [END model_bigqueryio_table_spec_object] 1101 1102 # [START model_bigqueryio_data_types] 1103 bigquery_data = [{ 1104 'string': 'abc', 1105 'bytes': base64.b64encode(b'\xab\xac'), 1106 'integer': 5, 1107 'float': 0.5, 1108 'numeric': Decimal('5'), 1109 'boolean': True, 1110 'timestamp': '2018-12-31 12:44:31.744957 UTC', 1111 'date': '2018-12-31', 1112 'time': '12:44:31', 1113 'datetime': '2018-12-31T12:44:31', 1114 'geography': 'POINT(30 10)' 1115 }] 1116 # [END model_bigqueryio_data_types] 1117 1118 # [START model_bigqueryio_read_table] 1119 max_temperatures = ( 1120 pipeline 1121 | 'ReadTable' >> beam.io.ReadFromBigQuery(table=table_spec) 1122 # Each row is a dictionary where the keys are the BigQuery columns 1123 | beam.Map(lambda elem: elem['max_temperature'])) 1124 # [END model_bigqueryio_read_table] 1125 1126 # [START model_bigqueryio_read_query] 1127 max_temperatures = ( 1128 pipeline 1129 | 'QueryTable' >> beam.io.ReadFromBigQuery( 1130 query='SELECT max_temperature FROM '\ 1131 '[clouddataflow-readonly:samples.weather_stations]') 1132 # Each row is a dictionary where the keys are the BigQuery columns 1133 | beam.Map(lambda elem: elem['max_temperature'])) 1134 # [END model_bigqueryio_read_query] 1135 1136 # [START model_bigqueryio_read_query_std_sql] 1137 
  max_temperatures = (
      pipeline
      | 'QueryTableStdSQL' >> beam.io.ReadFromBigQuery(
          query='SELECT max_temperature FROM '\
          '`clouddataflow-readonly.samples.weather_stations`',
          use_standard_sql=True)
      # Each row is a dictionary where the keys are the BigQuery columns
      | beam.Map(lambda elem: elem['max_temperature']))
  # [END model_bigqueryio_read_query_std_sql]

  # [START model_bigqueryio_schema]
  # column_name:BIGQUERY_TYPE, ...
  table_schema = 'source:STRING, quote:STRING'
  # [END model_bigqueryio_schema]

  # [START model_bigqueryio_schema_object]
  table_schema = {
      'fields': [{
          'name': 'source', 'type': 'STRING', 'mode': 'NULLABLE'
      }, {
          'name': 'quote', 'type': 'STRING', 'mode': 'REQUIRED'
      }]
  }
  # [END model_bigqueryio_schema_object]

  if write_project and write_dataset and write_table:
    table_spec = '{}:{}.{}'.format(write_project, write_dataset, write_table)

  # [START model_bigqueryio_write_input]
  quotes = pipeline | beam.Create([
      {
          'source': 'Mahatma Gandhi', 'quote': 'My life is my message.'
      },
      {
          'source': 'Yoda', 'quote': "Do, or do not. There is no 'try'."
      },
  ])
  # [END model_bigqueryio_write_input]

  # [START model_bigqueryio_write]
  quotes | beam.io.WriteToBigQuery(
      table_spec,
      schema=table_schema,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
  # [END model_bigqueryio_write]

  # [START model_bigqueryio_write_dynamic_destinations]
  fictional_characters_view = beam.pvalue.AsDict(
      pipeline | 'CreateCharacters' >> beam.Create([('Yoda', True),
                                                    ('Obi Wan Kenobi', True)]))

  def table_fn(element, fictional_characters):
    if element in fictional_characters:
      return 'my_dataset.fictional_quotes'
    else:
      return 'my_dataset.real_quotes'

  quotes | 'WriteWithDynamicDestination' >> beam.io.WriteToBigQuery(
      table_fn,
      schema=table_schema,
      table_side_inputs=(fictional_characters_view, ),
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
  # [END model_bigqueryio_write_dynamic_destinations]

  # [START model_bigqueryio_time_partitioning]
  quotes | 'WriteWithTimePartitioning' >> beam.io.WriteToBigQuery(
      table_spec,
      schema=table_schema,
      write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      additional_bq_parameters={'timePartitioning': {
          'type': 'HOUR'
      }})
  # [END model_bigqueryio_time_partitioning]


def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.

  To specify the transform's behavior, override the "expand" method, which
  takes a PCollection as its only parameter and returns a PCollection.
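
  For example (the name `lines` is illustrative), applying the CountWords
  transform defined below to a PCollection of text lines:

    formatted_counts = lines | CountWords()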
1222 """ 1223 import re 1224 1225 import apache_beam as beam 1226 1227 # [START composite_transform_example] 1228 # [START composite_ptransform_apply_method] 1229 # [START composite_ptransform_declare] 1230 class CountWords(beam.PTransform): 1231 # [END composite_ptransform_declare] 1232 1233 def expand(self, pcoll): 1234 return ( 1235 pcoll 1236 | beam.FlatMap(lambda x: re.findall(r'\w+', x)) 1237 | beam.combiners.Count.PerElement() 1238 | beam.Map(lambda word_c: '%s: %s' % (word_c[0], word_c[1]))) 1239 1240 # [END composite_ptransform_apply_method] 1241 # [END composite_transform_example] 1242 1243 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1244 ( 1245 pipeline 1246 | beam.Create(contents) 1247 | CountWords() 1248 | beam.io.WriteToText(output_path)) 1249 1250 1251 def model_multiple_pcollections_flatten(contents, output_path): 1252 """Merging a PCollection with Flatten.""" 1253 some_hash_fn = lambda s: ord(s[0]) 1254 partition_fn = lambda element, partitions: some_hash_fn(element) % partitions 1255 import apache_beam as beam 1256 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1257 1258 # Partition into deciles 1259 partitioned = pipeline | beam.Create(contents) | beam.Partition( 1260 partition_fn, 3) 1261 pcoll1 = partitioned[0] 1262 pcoll2 = partitioned[1] 1263 pcoll3 = partitioned[2] 1264 1265 # Flatten them back into 1 1266 1267 # A collection of PCollection objects can be represented simply 1268 # as a tuple (or list) of PCollections. 1269 # (The SDK for Python has no separate type to store multiple 1270 # PCollection objects, whether containing the same or different 1271 # types.) 1272 # [START model_multiple_pcollections_flatten] 1273 merged = ( 1274 (pcoll1, pcoll2, pcoll3) 1275 # A list of tuples can be "piped" directly into a Flatten transform. 1276 | beam.Flatten()) 1277 # [END model_multiple_pcollections_flatten] 1278 merged | beam.io.WriteToText(output_path) 1279 1280 1281 def model_multiple_pcollections_partition(contents, output_path): 1282 """Splitting a PCollection with Partition.""" 1283 some_hash_fn = lambda s: ord(s[0]) 1284 1285 def get_percentile(i): 1286 """Assume i in [0,100).""" 1287 return i 1288 1289 import apache_beam as beam 1290 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1291 1292 students = pipeline | beam.Create(contents) 1293 1294 # [START model_multiple_pcollections_partition] 1295 def partition_fn(student, num_partitions): 1296 return int(get_percentile(student) * num_partitions / 100) 1297 1298 by_decile = students | beam.Partition(partition_fn, 10) 1299 # [END model_multiple_pcollections_partition] 1300 # [START model_multiple_pcollections_partition_40th] 1301 fortieth_percentile = by_decile[4] 1302 # [END model_multiple_pcollections_partition_40th] 1303 1304 ([by_decile[d] for d in range(10) if d != 4] + [fortieth_percentile] 1305 | beam.Flatten() 1306 | beam.io.WriteToText(output_path)) 1307 1308 1309 def model_group_by_key(contents, output_path): 1310 """Applying a GroupByKey Transform.""" 1311 import re 1312 1313 import apache_beam as beam 1314 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1315 1316 def count_ones(word_ones): 1317 (word, ones) = word_ones 1318 return (word, sum(ones)) 1319 1320 words_and_counts = ( 1321 pipeline 1322 | beam.Create(contents) 1323 | beam.FlatMap(lambda x: re.findall(r'\w+', x)) 1324 | 'one word' >> beam.Map(lambda w: (w, 1))) 1325 # GroupByKey accepts a PCollection of (w, 1) and 1326 # outputs a PCollection of (w, (1, 1, ...)). 
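    # For example, [('cat', 1), ('dog', 1), ('cat', 1)] groups to
    # [('cat', (1, 1)), ('dog', (1,))], where each grouped value is an
    # iterable (shown here as a tuple for illustration).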
1327 # (A key/value pair is just a tuple in Python.) 1328 # This is a somewhat forced example, since one could 1329 # simply use beam.combiners.Count.PerElement here. 1330 # [START model_group_by_key_transform] 1331 grouped_words = words_and_counts | beam.GroupByKey() 1332 # [END model_group_by_key_transform] 1333 ( 1334 grouped_words 1335 | 'count words' >> beam.Map(count_ones) 1336 | beam.io.WriteToText(output_path)) 1337 1338 1339 def model_co_group_by_key_tuple(emails, phones, output_path): 1340 """Applying a CoGroupByKey Transform to a tuple.""" 1341 import apache_beam as beam 1342 # [START model_group_by_key_cogroupbykey_tuple] 1343 # The result PCollection contains one key-value element for each key in the 1344 # input PCollections. The key of the pair will be the key from the input and 1345 # the value will be a dictionary with two entries: 'emails' - an iterable of 1346 # all values for the current key in the emails PCollection and 'phones': an 1347 # iterable of all values for the current key in the phones PCollection. 1348 results = ({'emails': emails, 'phones': phones} | beam.CoGroupByKey()) 1349 1350 def join_info(name_info): 1351 (name, info) = name_info 1352 return '%s; %s; %s' %\ 1353 (name, sorted(info['emails']), sorted(info['phones'])) 1354 1355 contact_lines = results | beam.Map(join_info) 1356 # [END model_group_by_key_cogroupbykey_tuple] 1357 contact_lines | beam.io.WriteToText(output_path) 1358 1359 1360 def model_join_using_side_inputs( 1361 name_list, email_list, phone_list, output_path): 1362 """Joining PCollections using side inputs.""" 1363 1364 import apache_beam as beam 1365 from apache_beam.pvalue import AsIter 1366 1367 with TestPipeline() as pipeline: # Use TestPipeline for testing. 1368 # [START model_join_using_side_inputs] 1369 # This code performs a join by receiving the set of names as an input and 1370 # passing PCollections that contain emails and phone numbers as side inputs 1371 # instead of using CoGroupByKey. 
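    # AsIter below hands the complete emails and phones PCollections to every
    # invocation of join_info as iterable side inputs.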
1372 names = pipeline | 'names' >> beam.Create(name_list) 1373 emails = pipeline | 'email' >> beam.Create(email_list) 1374 phones = pipeline | 'phone' >> beam.Create(phone_list) 1375 1376 def join_info(name, emails, phone_numbers): 1377 filtered_emails = [] 1378 for name_in_list, email in emails: 1379 if name_in_list == name: 1380 filtered_emails.append(email) 1381 1382 filtered_phone_numbers = [] 1383 for name_in_list, phone_number in phone_numbers: 1384 if name_in_list == name: 1385 filtered_phone_numbers.append(phone_number) 1386 1387 return '; '.join([ 1388 '%s' % name, 1389 '%s' % ','.join(filtered_emails), 1390 '%s' % ','.join(filtered_phone_numbers) 1391 ]) 1392 1393 contact_lines = names | 'CreateContacts' >> beam.core.Map( 1394 join_info, AsIter(emails), AsIter(phones)) 1395 # [END model_join_using_side_inputs] 1396 contact_lines | beam.io.WriteToText(output_path) 1397 1398 1399 # [START model_library_transforms_keys] 1400 class Keys(beam.PTransform): 1401 def expand(self, pcoll): 1402 return pcoll | 'Keys' >> beam.Map(lambda k_v: k_v[0]) 1403 1404 1405 # [END model_library_transforms_keys] 1406 # pylint: enable=invalid-name 1407 1408 1409 # [START model_library_transforms_count] 1410 class Count(beam.PTransform): 1411 def expand(self, pcoll): 1412 return ( 1413 pcoll 1414 | 'PairWithOne' >> beam.Map(lambda v: (v, 1)) 1415 | beam.CombinePerKey(sum)) 1416 1417 1418 # [END model_library_transforms_count] 1419 1420 1421 def file_process_pattern_access_metadata(): 1422 1423 import apache_beam as beam 1424 from apache_beam.io import fileio 1425 1426 # [START FileProcessPatternAccessMetadataSnip1] 1427 with beam.Pipeline() as pipeline: 1428 readable_files = ( 1429 pipeline 1430 | fileio.MatchFiles('hdfs://path/to/*.txt') 1431 | fileio.ReadMatches() 1432 | beam.Reshuffle()) 1433 files_and_contents = ( 1434 readable_files 1435 | beam.Map(lambda x: (x.metadata.path, x.read_utf8()))) 1436 # [END FileProcessPatternAccessMetadataSnip1] 1437 1438 1439 def accessing_valueprovider_info_after_run(): 1440 # [START AccessingValueProviderInfoAfterRunSnip1] 1441 import logging 1442 1443 import apache_beam as beam 1444 from apache_beam.options.pipeline_options import PipelineOptions 1445 from apache_beam.options.value_provider import RuntimeValueProvider 1446 1447 class MyOptions(PipelineOptions): 1448 @classmethod 1449 def _add_argparse_args(cls, parser): 1450 parser.add_value_provider_argument('--string_value', type=str) 1451 1452 class LogValueProvidersFn(beam.DoFn): 1453 def __init__(self, string_vp): 1454 self.string_vp = string_vp 1455 1456 # Define the DoFn that logs the ValueProvider value. 1457 # The DoFn is called when creating the pipeline branch. 1458 # This example logs the ValueProvider value, but 1459 # you could store it by pushing it to an external database. 1460 def process(self, an_int): 1461 logging.info('The string_value is %s' % self.string_vp.get()) 1462 # Another option (where you don't need to pass the value at all) is: 1463 logging.info( 1464 'The string value is %s' % 1465 RuntimeValueProvider.get_value('string_value', str, '')) 1466 1467 beam_options = PipelineOptions() 1468 args = beam_options.view_as(MyOptions) 1469 1470 # Create pipeline. 1471 with beam.Pipeline(options=beam_options) as pipeline: 1472 1473 # Add a branch for logging the ValueProvider value. 1474 _ = ( 1475 pipeline 1476 | beam.Create([None]) 1477 | 'LogValueProvs' >> beam.ParDo(LogValueProvidersFn(args.string_value))) 1478 1479 # The main pipeline. 
    result_pc = (
        pipeline
        | "main_pc" >> beam.Create([1, 2, 3])
        | beam.combiners.Sum.Globally())

  # [END AccessingValueProviderInfoAfterRunSnip1]


def side_input_slow_update(
    src_file_pattern,
    first_timestamp,
    last_timestamp,
    interval,
    sample_main_input_elements,
    main_input_windowing_interval):
  # [START SideInputSlowUpdateSnip1]
  from apache_beam.transforms.periodicsequence import PeriodicImpulse
  from apache_beam.transforms.window import TimestampedValue
  from apache_beam.transforms import window

  # from apache_beam.utils.timestamp import MAX_TIMESTAMP
  # last_timestamp = MAX_TIMESTAMP to go on indefinitely

  # Any user-defined function.
  # A cross join is used as an example.
  def cross_join(left, rights):
    for x in rights:
      yield (left, x)

  # Create pipeline.
  pipeline = beam.Pipeline()
  side_input = (
      pipeline
      | 'PeriodicImpulse' >> PeriodicImpulse(
          first_timestamp, last_timestamp, interval, True)
      | 'MapToFileName' >> beam.Map(lambda x: src_file_pattern + str(x))
      | 'ReadFromFile' >> beam.io.ReadAllFromText())

  main_input = (
      pipeline
      | 'MpImpulse' >> beam.Create(sample_main_input_elements)
      |
      'MapMpToTimestamped' >> beam.Map(lambda src: TimestampedValue(src, src))
      | 'WindowMpInto' >> beam.WindowInto(
          window.FixedWindows(main_input_windowing_interval)))

  result = (
      main_input
      | 'ApplyCrossJoin' >> beam.FlatMap(
          cross_join, rights=beam.pvalue.AsIter(side_input)))
  # [END SideInputSlowUpdateSnip1]

  return pipeline, result


def bigqueryio_deadletter():
  # [START BigQueryIODeadLetter]

  # Create pipeline.
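  # The dead-letter pattern: rows that cannot be inserted are returned by
  # WriteToBigQuery under the 'FailedRows' key of its result (accessed below
  # as errors['FailedRows']), so they can be logged or routed elsewhere
  # instead of being lost.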
1539 schema = ({'fields': [{'name': 'a', 'type': 'STRING', 'mode': 'REQUIRED'}]}) 1540 1541 pipeline = beam.Pipeline() 1542 1543 errors = ( 1544 pipeline | 'Data' >> beam.Create([1, 2]) 1545 | 'CreateBrokenData' >> 1546 beam.Map(lambda src: {'a': src} if src == 2 else {'a': None}) 1547 | 'WriteToBigQuery' >> beam.io.WriteToBigQuery( 1548 "<Your Project:Test.dummy_a_table", 1549 schema=schema, 1550 insert_retry_strategy='RETRY_ON_TRANSIENT_ERROR', 1551 create_disposition='CREATE_IF_NEEDED', 1552 write_disposition='WRITE_APPEND')) 1553 result = ( 1554 errors['FailedRows'] 1555 | 'PrintErrors' >> 1556 beam.FlatMap(lambda err: print("Error Found {}".format(err)))) 1557 # [END BigQueryIODeadLetter] 1558 1559 return result 1560 1561 1562 def extract_sentiments(response): 1563 # [START nlp_extract_sentiments] 1564 return { 1565 'sentences': [{ 1566 sentence.text.content: sentence.sentiment.score 1567 } for sentence in response.sentences], 1568 'document_sentiment': response.document_sentiment.score, 1569 } 1570 # [END nlp_extract_sentiments] 1571 1572 1573 def extract_entities(response): 1574 # [START nlp_extract_entities] 1575 return [{ 1576 'name': entity.name, 1577 'type': nlp.enums.Entity.Type(entity.type).name, 1578 } for entity in response.entities] 1579 # [END nlp_extract_entities] 1580 1581 1582 def analyze_dependency_tree(response): 1583 # [START analyze_dependency_tree] 1584 from collections import defaultdict 1585 adjacency_lists = [] 1586 1587 index = 0 1588 for sentence in response.sentences: 1589 adjacency_list = defaultdict(list) 1590 sentence_begin = sentence.text.begin_offset 1591 sentence_end = sentence_begin + len(sentence.text.content) - 1 1592 1593 while index < len(response.tokens) and \ 1594 response.tokens[index].text.begin_offset <= sentence_end: 1595 token = response.tokens[index] 1596 head_token_index = token.dependency_edge.head_token_index 1597 head_token_text = response.tokens[head_token_index].text.content 1598 adjacency_list[head_token_text].append(token.text.content) 1599 index += 1 1600 adjacency_lists.append(adjacency_list) 1601 # [END analyze_dependency_tree] 1602 1603 return adjacency_lists 1604 1605 1606 def nlp_analyze_text(): 1607 # [START nlp_analyze_text] 1608 features = nlp.types.AnnotateTextRequest.Features( 1609 extract_entities=True, 1610 extract_document_sentiment=True, 1611 extract_entity_sentiment=True, 1612 extract_syntax=True, 1613 ) 1614 1615 with beam.Pipeline() as pipeline: 1616 responses = ( 1617 pipeline 1618 | beam.Create([ 1619 'My experience so far has been fantastic! ' 1620 'I\'d really recommend this product.' 
1621 ]) 1622 | beam.Map(lambda x: nlp.Document(x, type='PLAIN_TEXT')) 1623 | nlp.AnnotateText(features)) 1624 1625 _ = ( 1626 responses 1627 | beam.Map(extract_sentiments) 1628 | 'Parse sentiments to JSON' >> beam.Map(json.dumps) 1629 | 'Write sentiments' >> beam.io.WriteToText('sentiments.txt')) 1630 1631 _ = ( 1632 responses 1633 | beam.Map(extract_entities) 1634 | 'Parse entities to JSON' >> beam.Map(json.dumps) 1635 | 'Write entities' >> beam.io.WriteToText('entities.txt')) 1636 1637 _ = ( 1638 responses 1639 | beam.Map(analyze_dependency_tree) 1640 | 'Parse adjacency list to JSON' >> beam.Map(json.dumps) 1641 | 'Write adjacency list' >> beam.io.WriteToText('adjancency_list.txt')) 1642 # [END nlp_analyze_text] 1643 1644 1645 def sdf_basic_example(): 1646 import os 1647 from apache_beam.io.restriction_trackers import OffsetRange 1648 read_next_record = None 1649 1650 # [START SDF_BasicExample] 1651 class FileToWordsRestrictionProvider(beam.transforms.core.RestrictionProvider 1652 ): 1653 def initial_restriction(self, file_name): 1654 return OffsetRange(0, os.stat(file_name).st_size) 1655 1656 def create_tracker(self, restriction): 1657 return beam.io.restriction_trackers.OffsetRestrictionTracker() 1658 1659 class FileToWordsFn(beam.DoFn): 1660 def process( 1661 self, 1662 file_name, 1663 # Alternatively, we can let FileToWordsFn itself inherit from 1664 # RestrictionProvider, implement the required methods and let 1665 # tracker=beam.DoFn.RestrictionParam() which will use self as 1666 # the provider. 1667 tracker=beam.DoFn.RestrictionParam(FileToWordsRestrictionProvider())): 1668 with open(file_name) as file_handle: 1669 file_handle.seek(tracker.current_restriction.start()) 1670 while tracker.try_claim(file_handle.tell()): 1671 yield read_next_record(file_handle) 1672 1673 # Providing the coder is only necessary if it can not be inferred at 1674 # runtime. 1675 def restriction_coder(self): 1676 return ... 1677 1678 # [END SDF_BasicExample] 1679 1680 1681 def sdf_basic_example_with_splitting(): 1682 from apache_beam.io.restriction_trackers import OffsetRange 1683 1684 # [START SDF_BasicExampleWithSplitting] 1685 class FileToWordsRestrictionProvider(beam.transforms.core.RestrictionProvider 1686 ): 1687 def split(self, file_name, restriction): 1688 # Compute and output 64 MiB size ranges to process in parallel 1689 split_size = 64 * (1 << 20) 1690 i = restriction.start 1691 while i < restriction.end - split_size: 1692 yield OffsetRange(i, i + split_size) 1693 i += split_size 1694 yield OffsetRange(i, restriction.end) 1695 1696 # [END SDF_BasicExampleWithSplitting] 1697 1698 1699 def sdf_sdk_initiated_checkpointing(): 1700 timestamp = None 1701 external_service = None 1702 1703 class MyRestrictionProvider(object): 1704 pass 1705 1706 # [START SDF_UserInitiatedCheckpoint] 1707 class MySplittableDoFn(beam.DoFn): 1708 def process( 1709 self, 1710 element, 1711 restriction_tracker=beam.DoFn.RestrictionParam( 1712 MyRestrictionProvider())): 1713 current_position = restriction_tracker.current_restriction.start() 1714 while True: 1715 # Pull records from an external service. 1716 try: 1717 records = external_service.fetch(current_position) 1718 if records.empty(): 1719 # Set a shorter delay in case we are being throttled. 
            restriction_tracker.defer_remainder(timestamp.Duration(seconds=10))
            return
          for record in records:
            if restriction_tracker.try_claim(record.position):
              current_position = record.position
              yield record
            else:
              return
        except TimeoutError:
          # Set a longer delay in case we are being throttled.
          restriction_tracker.defer_remainder(timestamp.Duration(seconds=60))
          return

  # [END SDF_UserInitiatedCheckpoint]


def sdf_get_size():
  # [START SDF_GetSize]
  # The RestrictionProvider is responsible for calculating the size of a given
  # restriction.
  class MyRestrictionProvider(beam.transforms.core.RestrictionProvider):
    def restriction_size(self, file_name, restriction):
      weight = 2 if "expensiveRecords" in file_name else 1
      return restriction.size() * weight

  # [END SDF_GetSize]


def sdf_bad_try_claim_loop():
  class FileToWordsRestrictionProvider(object):
    pass

  read_next_record = None

  # [START SDF_BadTryClaimLoop]
  class BadTryClaimLoop(beam.DoFn):
    def process(
        self,
        file_name,
        tracker=beam.DoFn.RestrictionParam(FileToWordsRestrictionProvider())):
      with open(file_name) as file_handle:
        file_handle.seek(tracker.current_restriction.start())
        # The restriction tracker can be modified by another thread in
        # parallel, so storing state locally is ill advised.
        end = tracker.current_restriction.end()
        while file_handle.tell() < end:
          # Only after successfully claiming should we produce any output
          # and/or perform side effects.
          tracker.try_claim(file_handle.tell())
          yield read_next_record(file_handle)

  # [END SDF_BadTryClaimLoop]


def sdf_custom_watermark_estimator():
  from apache_beam.io.iobase import WatermarkEstimator
  from apache_beam.transforms.core import WatermarkEstimatorProvider
  current_watermark = None

  class MyRestrictionProvider(object):
    pass

  # [START SDF_CustomWatermarkEstimator]
  # (Optional) Define a custom watermark state type to save information between
  # bundle processing rounds.
  class MyCustomWatermarkEstimatorState(object):
    def __init__(self, element, restriction):
      # Store data necessary for future watermark computations
      pass

  # Define a WatermarkEstimator
  class MyCustomWatermarkEstimator(WatermarkEstimator):
    def __init__(self, estimator_state):
      self.state = estimator_state

    def observe_timestamp(self, timestamp):
      # Will be invoked on each output from the SDF
      pass

    def current_watermark(self):
      # Return a monotonically increasing value
      return current_watermark

    def get_estimator_state(self):
      # Return state to resume future watermark estimation after a
      # checkpoint/split
      return self.state

  # Then, a WatermarkEstimatorProvider needs to be created for this
  # WatermarkEstimator
  class MyWatermarkEstimatorProvider(WatermarkEstimatorProvider):
    def initial_estimator_state(self, element, restriction):
      return MyCustomWatermarkEstimatorState(element, restriction)

    def create_watermark_estimator(self, estimator_state):
      return MyCustomWatermarkEstimator(estimator_state)

  # Finally, define the SDF using your estimator.
1818 class MySplittableDoFn(beam.DoFn): 1819 def process( 1820 self, 1821 element, 1822 restriction_tracker=beam.DoFn.RestrictionParam(MyRestrictionProvider()), 1823 watermark_estimator=beam.DoFn.WatermarkEstimatorParam( 1824 MyWatermarkEstimatorProvider())): 1825 # The current watermark can be inspected. 1826 watermark_estimator.current_watermark() 1827 1828 # [END SDF_CustomWatermarkEstimator] 1829 1830 1831 def sdf_truncate(): 1832 # [START SDF_Truncate] 1833 class MyRestrictionProvider(beam.transforms.core.RestrictionProvider): 1834 def truncate(self, file_name, restriction): 1835 if "optional" in file_name: 1836 # Skip optional files 1837 return None 1838 return restriction 1839 1840 # [END SDF_Truncate] 1841 1842 1843 def bundle_finalize(): 1844 my_callback_func = None 1845 1846 # [START BundleFinalize] 1847 class MySplittableDoFn(beam.DoFn): 1848 def process(self, element, bundle_finalizer=beam.DoFn.BundleFinalizerParam): 1849 # ... produce output ... 1850 1851 # Register callback function for this bundle that performs the side 1852 # effect. 1853 bundle_finalizer.register(my_callback_func) 1854 1855 # [END BundleFinalize]
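

# The key-value storage API assumed by SimpleKVSink above (connect, open_table,
# write_to_table, rename_table) is only described in comments; model_custom_sink
# expects the caller to supply a mock object implementing it. A minimal
# in-memory sketch of that API, for readers who want to run the sink example
# locally, could look like this (hypothetical helper, not one of the snippets
# included in the web docs):
class InMemorySimpleKV(object):
  """Toy in-memory stand-in for the simplekv service described above."""
  def __init__(self):
    self._tables = {}

  def connect(self, url):
    # Returns an access token; the in-memory version just derives it from the
    # URL.
    return 'token-for-%s' % url

  def open_table(self, access_token, table_name):
    # Creates (or reopens) a table named 'table_name' and returns it.
    return self._tables.setdefault(table_name, {})

  def write_to_table(self, access_token, table, key, value):
    # Writes a key-value pair to the given table.
    table[key] = value

  def rename_table(self, access_token, old_name, new_name):
    # Renames the table named 'old_name' to 'new_name'.
    self._tables[new_name] = self._tables.pop(old_name)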