github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/snippets/snippets_test.py

     1  # coding=utf-8
     2  #
     3  # Licensed to the Apache Software Foundation (ASF) under one or more
     4  # contributor license agreements.  See the NOTICE file distributed with
     5  # this work for additional information regarding copyright ownership.
     6  # The ASF licenses this file to You under the Apache License, Version 2.0
     7  # (the "License"); you may not use this file except in compliance with
     8  # the License.  You may obtain a copy of the License at
     9  #
    10  #    http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  # Unless required by applicable law or agreed to in writing, software
    13  # distributed under the License is distributed on an "AS IS" BASIS,
    14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  # See the License for the specific language governing permissions and
    16  # limitations under the License.
    17  #
    18  
    19  """Tests for all code snippets used in public docs."""
    20  # pytype: skip-file
    21  
    22  import gc
    23  import glob
    24  import gzip
    25  import logging
    26  import math
    27  import os
    28  import sys
    29  import tempfile
    30  import time
    31  import unittest
    32  import uuid
    33  
    34  import mock
    35  import parameterized
    36  
    37  import apache_beam as beam
    38  from apache_beam import WindowInto
    39  from apache_beam import coders
    40  from apache_beam import pvalue
    41  from apache_beam import typehints
    42  from apache_beam.coders.coders import ToBytesCoder
    43  from apache_beam.examples.snippets import snippets
    44  from apache_beam.metrics import Metrics
    45  from apache_beam.metrics.metric import MetricsFilter
    46  from apache_beam.options.pipeline_options import GoogleCloudOptions
    47  from apache_beam.options.pipeline_options import PipelineOptions
    48  from apache_beam.options.pipeline_options import StandardOptions
    49  from apache_beam.testing.test_pipeline import TestPipeline
    50  from apache_beam.testing.test_stream import TestStream
    51  from apache_beam.testing.util import assert_that
    52  from apache_beam.testing.util import equal_to
    53  from apache_beam.transforms import combiners
    54  from apache_beam.transforms.trigger import AccumulationMode
    55  from apache_beam.transforms.trigger import AfterAny
    56  from apache_beam.transforms.trigger import AfterCount
    57  from apache_beam.transforms.trigger import AfterProcessingTime
    58  from apache_beam.transforms.trigger import AfterWatermark
    59  from apache_beam.transforms.trigger import Repeatedly
    60  from apache_beam.transforms.window import FixedWindows
    61  from apache_beam.transforms.window import TimestampedValue
    62  from apache_beam.utils.windowed_value import WindowedValue
    63  
     64  # Protect against environments where the apitools library is not available.
    65  # pylint: disable=wrong-import-order, wrong-import-position
    66  try:
    67    from apitools.base.py import base_api
    68  except ImportError:
    69    base_api = None
    70  # pylint: enable=wrong-import-order, wrong-import-position
    71  
     72  # Protect against environments where the datastore library is not available.
    73  # pylint: disable=wrong-import-order, wrong-import-position
    74  try:
    75    from google.cloud.datastore import client as datastore_client
    76  except ImportError:
    77    datastore_client = None
    78  # pylint: enable=wrong-import-order, wrong-import-position
    79  
    80  # Protect against environments where the PubSub library is not available.
    81  # pylint: disable=wrong-import-order, wrong-import-position
    82  try:
    83    from google.cloud import pubsub
    84  except ImportError:
    85    pubsub = None
    86  # pylint: enable=wrong-import-order, wrong-import-position
    87  
    88  
    89  class ParDoTest(unittest.TestCase):
    90    """Tests for model/par-do."""
    91    def test_pardo(self):
    92      # Note: "words" and "ComputeWordLengthFn" are referenced by name in
    93      # the text of the doc.
    94  
    95      words = ['aa', 'bbb', 'c']
    96  
    97      # [START model_pardo_pardo]
    98      class ComputeWordLengthFn(beam.DoFn):
    99        def process(self, element):
   100          return [len(element)]
   101  
   102      # [END model_pardo_pardo]
   103  
   104      # [START model_pardo_apply]
   105      # Apply a ParDo to the PCollection "words" to compute lengths for each word.
   106      word_lengths = words | beam.ParDo(ComputeWordLengthFn())
   107      # [END model_pardo_apply]
   108      self.assertEqual({2, 3, 1}, set(word_lengths))
   109  
   110    def test_pardo_yield(self):
   111      words = ['aa', 'bbb', 'c']
   112  
   113      # [START model_pardo_yield]
   114      class ComputeWordLengthFn(beam.DoFn):
   115        def process(self, element):
   116          yield len(element)
   117  
   118      # [END model_pardo_yield]
   119  
   120      word_lengths = words | beam.ParDo(ComputeWordLengthFn())
   121      self.assertEqual({2, 3, 1}, set(word_lengths))
   122  
   123    def test_pardo_using_map(self):
   124      words = ['aa', 'bbb', 'c']
   125      # [START model_pardo_using_map]
   126      word_lengths = words | beam.Map(len)
   127      # [END model_pardo_using_map]
   128  
   129      self.assertEqual({2, 3, 1}, set(word_lengths))
   130  
   131    def test_pardo_using_flatmap(self):
   132      words = ['aa', 'bbb', 'c']
   133      # [START model_pardo_using_flatmap]
   134      word_lengths = words | beam.FlatMap(lambda word: [len(word)])
   135      # [END model_pardo_using_flatmap]
   136  
   137      self.assertEqual({2, 3, 1}, set(word_lengths))
   138  
   139    def test_pardo_using_flatmap_yield(self):
   140      words = ['aA', 'bbb', 'C']
   141  
   142      # [START model_pardo_using_flatmap_yield]
   143      def capitals(word):
   144        for letter in word:
   145          if 'A' <= letter <= 'Z':
   146            yield letter
   147  
   148      all_capitals = words | beam.FlatMap(capitals)
   149      # [END model_pardo_using_flatmap_yield]
   150  
   151      self.assertEqual({'A', 'C'}, set(all_capitals))
   152  
   153    def test_pardo_with_label(self):
   154      words = ['aa', 'bbc', 'defg']
   155      # [START model_pardo_with_label]
   156      result = words | 'CountUniqueLetters' >> beam.Map(
   157          lambda word: len(set(word)))
   158      # [END model_pardo_with_label]
   159  
   160      self.assertEqual({1, 2, 4}, set(result))
   161  
   162    def test_pardo_side_input(self):
   163      # pylint: disable=line-too-long
   164      with TestPipeline() as p:
   165        words = p | 'start' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])
   166  
   167        # [START model_pardo_side_input]
   168        # Callable takes additional arguments.
   169        def filter_using_length(word, lower_bound, upper_bound=float('inf')):
   170          if lower_bound <= len(word) <= upper_bound:
   171            yield word
   172  
   173        # Construct a deferred side input.
   174        avg_word_len = (
   175            words
   176            | beam.Map(len)
   177            | beam.CombineGlobally(beam.combiners.MeanCombineFn()))
   178  
   179        # Call with explicit side inputs.
   180        small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0, 3)
   181  
   182        # A single deferred side input.
   183        larger_than_average = (
   184            words | 'large' >> beam.FlatMap(
   185                filter_using_length, lower_bound=pvalue.AsSingleton(avg_word_len))
   186        )
   187  
   188        # Mix and match.
   189        small_but_nontrivial = words | beam.FlatMap(
   190            filter_using_length,
   191            lower_bound=2,
   192            upper_bound=pvalue.AsSingleton(avg_word_len))
   193        # [END model_pardo_side_input]
   194  
   195        assert_that(small_words, equal_to(['a', 'bb', 'ccc']))
   196        assert_that(
   197            larger_than_average,
   198            equal_to(['ccc', 'dddd']),
   199            label='larger_than_average')
   200        assert_that(
   201            small_but_nontrivial, equal_to(['bb']), label='small_but_not_trivial')
   202  
   203    def test_pardo_side_input_dofn(self):
   204      words = ['a', 'bb', 'ccc', 'dddd']
   205  
   206      # [START model_pardo_side_input_dofn]
   207      class FilterUsingLength(beam.DoFn):
   208        def process(self, element, lower_bound, upper_bound=float('inf')):
   209          if lower_bound <= len(element) <= upper_bound:
   210            yield element
   211  
   212      small_words = words | beam.ParDo(FilterUsingLength(), 0, 3)
   213      # [END model_pardo_side_input_dofn]
   214      self.assertEqual({'a', 'bb', 'ccc'}, set(small_words))
   215  
   216    def test_pardo_with_tagged_outputs(self):
   217      # [START model_pardo_emitting_values_on_tagged_outputs]
   218      class ProcessWords(beam.DoFn):
   219        def process(self, element, cutoff_length, marker):
   220          if len(element) <= cutoff_length:
   221            # Emit this short word to the main output.
   222            yield element
   223          else:
   224            # Emit this word's long length to the 'above_cutoff_lengths' output.
   225            yield pvalue.TaggedOutput('above_cutoff_lengths', len(element))
   226          if element.startswith(marker):
   227            # Emit this word to a different output with the 'marked strings' tag.
   228            yield pvalue.TaggedOutput('marked strings', element)
   229  
   230      # [END model_pardo_emitting_values_on_tagged_outputs]
   231  
   232      words = ['a', 'an', 'the', 'music', 'xyz']
   233  
   234      # [START model_pardo_with_tagged_outputs]
   235      results = (
   236          words
   237          | beam.ParDo(ProcessWords(), cutoff_length=2, marker='x').with_outputs(
   238              'above_cutoff_lengths',
   239              'marked strings',
   240              main='below_cutoff_strings'))
   241      below = results.below_cutoff_strings
   242      above = results.above_cutoff_lengths
   243      marked = results['marked strings']  # indexing works as well
   244      # [END model_pardo_with_tagged_outputs]
   245  
   246      self.assertEqual({'a', 'an'}, set(below))
   247      self.assertEqual({3, 5}, set(above))
   248      self.assertEqual({'xyz'}, set(marked))
   249  
   250      # [START model_pardo_with_tagged_outputs_iter]
   251      below, above, marked = (words
   252                              | beam.ParDo(
   253                                  ProcessWords(), cutoff_length=2, marker='x')
   254                              .with_outputs('above_cutoff_lengths',
   255                                            'marked strings',
   256                                            main='below_cutoff_strings'))
   257      # [END model_pardo_with_tagged_outputs_iter]
   258  
   259      self.assertEqual({'a', 'an'}, set(below))
   260      self.assertEqual({3, 5}, set(above))
   261      self.assertEqual({'xyz'}, set(marked))
   262  
   263    def test_pardo_with_undeclared_outputs(self):
   264      # Note: the use of undeclared outputs is currently not supported in eager
   265      # execution mode.
   266      with TestPipeline() as p:
   267        numbers = p | beam.Create([1, 2, 3, 4, 5, 10, 20])
   268  
   269        # [START model_pardo_with_undeclared_outputs]
   270        def even_odd(x):
   271          yield pvalue.TaggedOutput('odd' if x % 2 else 'even', x)
   272          if x % 10 == 0:
   273            yield x
   274  
   275        results = numbers | beam.FlatMap(even_odd).with_outputs()
   276  
   277        evens = results.even
   278        odds = results.odd
   279        tens = results[None]  # the undeclared main output
   280        # [END model_pardo_with_undeclared_outputs]
   281  
   282        assert_that(evens, equal_to([2, 4, 10, 20]), label='assert_even')
   283        assert_that(odds, equal_to([1, 3, 5]), label='assert_odds')
   284        assert_that(tens, equal_to([10, 20]), label='assert_tens')
   285  
   286  
   287  class TypeHintsTest(unittest.TestCase):
   288    def test_bad_types(self):
   289      # [START type_hints_missing_define_numbers]
   290      p = TestPipeline()
   291  
   292      numbers = p | beam.Create(['1', '2', '3'])
   293      # [END type_hints_missing_define_numbers]
   294  
   295      # Consider the following code.
   296      # pylint: disable=expression-not-assigned
   297      # pylint: disable=unused-variable
   298      # [START type_hints_missing_apply]
   299      evens = numbers | beam.Filter(lambda x: x % 2 == 0)
   300      # [END type_hints_missing_apply]
   301  
   302      # Now suppose numbers was defined as [snippet above].
   303      # When running this pipeline, you'd get a runtime error,
   304      # possibly on a remote machine, possibly very late.
   305  
   306      with self.assertRaises(TypeError):
   307        p.run()
   308  
   309      # To catch this early, we can assert what types we expect.
   310      with self.assertRaises(typehints.TypeCheckError):
   311        # [START type_hints_takes]
   312        evens = numbers | beam.Filter(lambda x: x % 2 == 0).with_input_types(int)
   313        # [END type_hints_takes]
   314  
   315      # Type hints can be declared on DoFns and callables as well, rather
    316      # than where they're used, to be more self-contained.
   317      with self.assertRaises(typehints.TypeCheckError):
   318        # [START type_hints_do_fn]
   319        @beam.typehints.with_input_types(int)
   320        class FilterEvensDoFn(beam.DoFn):
   321          def process(self, element):
   322            if element % 2 == 0:
   323              yield element
   324  
   325        evens = numbers | beam.ParDo(FilterEvensDoFn())
   326        # [END type_hints_do_fn]
   327  
   328      words = p | 'words' >> beam.Create(['a', 'bb', 'c'])
   329      # One can assert outputs and apply them to transforms as well.
   330      # Helps document the contract and checks it at pipeline construction time.
   331      # [START type_hints_transform]
   332      from typing import Tuple, TypeVar
   333  
   334      T = TypeVar('T')
   335  
   336      @beam.typehints.with_input_types(T)
   337      @beam.typehints.with_output_types(Tuple[int, T])
   338      class MyTransform(beam.PTransform):
   339        def expand(self, pcoll):
   340          return pcoll | beam.Map(lambda x: (len(x), x))
   341  
   342      words_with_lens = words | MyTransform()
   343      # [END type_hints_transform]
   344  
   345      # Given an input of str, the inferred output type would be Tuple[int, str].
   346      self.assertEqual(typehints.Tuple[int, str], words_with_lens.element_type)
   347  
   348      # pylint: disable=expression-not-assigned
   349      with self.assertRaises(typehints.TypeCheckError):
   350        words_with_lens | beam.Map(lambda x: x).with_input_types(Tuple[int, int])
   351  
   352    def test_bad_types_annotations(self):
   353      p = TestPipeline(options=PipelineOptions(pipeline_type_check=True))
   354  
   355      numbers = p | beam.Create(['1', '2', '3'])
   356  
   357      # Consider the following code.
   358      # pylint: disable=expression-not-assigned
   359      # pylint: disable=unused-variable
   360      class FilterEvensDoFn(beam.DoFn):
   361        def process(self, element):
   362          if element % 2 == 0:
   363            yield element
   364  
   365      evens = numbers | 'Untyped Filter' >> beam.ParDo(FilterEvensDoFn())
   366  
   367      # Now suppose numbers was defined as [snippet above].
   368      # When running this pipeline, you'd get a runtime error,
   369      # possibly on a remote machine, possibly very late.
   370  
   371      with self.assertRaises(TypeError):
   372        p.run()
   373  
   374      # To catch this early, we can annotate process() with the expected types.
   375      # Beam will then use these as type hints and perform type checking before
   376      # the pipeline starts.
   377      with self.assertRaises(typehints.TypeCheckError):
   378        # [START type_hints_do_fn_annotations]
   379        from typing import Iterable
   380  
   381        class TypedFilterEvensDoFn(beam.DoFn):
   382          def process(self, element: int) -> Iterable[int]:
   383            if element % 2 == 0:
   384              yield element
   385  
   386        evens = numbers | 'filter_evens' >> beam.ParDo(TypedFilterEvensDoFn())
   387        # [END type_hints_do_fn_annotations]
   388  
   389      # Another example, using a list output type. Notice that the output
   390      # annotation has an additional Optional for the else clause.
   391      with self.assertRaises(typehints.TypeCheckError):
   392        # [START type_hints_do_fn_annotations_optional]
   393        from typing import List, Optional
   394  
   395        class FilterEvensDoubleDoFn(beam.DoFn):
   396          def process(self, element: int) -> Optional[List[int]]:
   397            if element % 2 == 0:
   398              return [element, element]
   399            return None
   400  
   401        evens = numbers | 'double_evens' >> beam.ParDo(FilterEvensDoubleDoFn())
   402        # [END type_hints_do_fn_annotations_optional]
   403  
   404      # Example using an annotated function.
   405      with self.assertRaises(typehints.TypeCheckError):
   406        # [START type_hints_map_annotations]
   407        def my_fn(element: int) -> str:
   408          return 'id_' + str(element)
   409  
   410        ids = numbers | 'to_id' >> beam.Map(my_fn)
   411        # [END type_hints_map_annotations]
   412  
   413      # Example using an annotated PTransform.
   414      with self.assertRaises(typehints.TypeCheckError):
   415        # [START type_hints_ptransforms]
   416        from apache_beam.pvalue import PCollection
   417  
   418        class IntToStr(beam.PTransform):
   419          def expand(self, pcoll: PCollection[int]) -> PCollection[str]:
   420            return pcoll | beam.Map(lambda elem: str(elem))
   421  
   422        ids = numbers | 'convert to str' >> IntToStr()
   423        # [END type_hints_ptransforms]
   424  
   425    def test_runtime_checks_off(self):
   426      # We do not run the following pipeline, as it has incorrect type
   427      # information, and may fail with obscure errors, depending on the runner
   428      # implementation.
   429  
   430      # pylint: disable=expression-not-assigned
   431      # [START type_hints_runtime_off]
   432      p = TestPipeline()
   433      p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
   434      # [END type_hints_runtime_off]
   435  
   436    def test_runtime_checks_on(self):
   437      # pylint: disable=expression-not-assigned
   438      with self.assertRaises(typehints.TypeCheckError):
   439        # [START type_hints_runtime_on]
   440        p = TestPipeline(options=PipelineOptions(runtime_type_check=True))
   441        p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
   442        p.run()
   443        # [END type_hints_runtime_on]
   444  
   445    def test_deterministic_key(self):
   446      with TestPipeline() as p:
   447        lines = (
   448            p | beam.Create([
   449                'banana,fruit,3',
   450                'kiwi,fruit,2',
   451                'kiwi,fruit,2',
   452                'zucchini,veg,3'
   453            ]))
   454  
   455        # For pickling.
   456        global Player  # pylint: disable=global-variable-not-assigned
   457  
   458        # [START type_hints_deterministic_key]
   459        from typing import Tuple
   460  
   461        class Player(object):
   462          def __init__(self, team, name):
   463            self.team = team
   464            self.name = name
   465  
   466        class PlayerCoder(beam.coders.Coder):
   467          def encode(self, player):
   468            return ('%s:%s' % (player.team, player.name)).encode('utf-8')
   469  
   470          def decode(self, s):
   471            return Player(*s.decode('utf-8').split(':'))
   472  
   473          def is_deterministic(self):
   474            return True
   475  
   476        beam.coders.registry.register_coder(Player, PlayerCoder)
   477  
   478        def parse_player_and_score(csv):
   479          name, team, score = csv.split(',')
   480          return Player(team, name), int(score)
   481  
   482        totals = (
   483            lines
   484            | beam.Map(parse_player_and_score)
   485            | beam.CombinePerKey(sum).with_input_types(Tuple[Player, int]))
   486        # [END type_hints_deterministic_key]
   487  
   488        assert_that(
   489            totals | beam.Map(lambda k_v: (k_v[0].name, k_v[1])),
   490            equal_to([('banana', 3), ('kiwi', 4), ('zucchini', 3)]))
   491  
   492  
   493  class SnippetsTest(unittest.TestCase):
   494    # Replacing text read/write transforms with dummy transforms for testing.
   495  
   496    class DummyReadTransform(beam.PTransform):
   497      """A transform that will replace iobase.ReadFromText.
   498  
   499      To be used for testing.
   500      """
   501      def __init__(self, file_to_read=None, compression_type=None):
   502        self.file_to_read = file_to_read
   503        self.compression_type = compression_type
   504  
   505      class ReadDoFn(beam.DoFn):
   506        def __init__(self, file_to_read, compression_type):
   507          self.file_to_read = file_to_read
   508          self.compression_type = compression_type
   509          self.coder = coders.StrUtf8Coder()
   510  
   511        def process(self, element):
   512          pass
   513  
   514        def finish_bundle(self):
   515          from apache_beam.transforms import window
   516  
   517          assert self.file_to_read
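                 # Read the file(s) at finish_bundle time and emit each line as a
                 # WindowedValue in the global window; the -1 timestamp is just a
                 # placeholder since these dummy reads carry no real event time.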
   518          for file_name in glob.glob(self.file_to_read):
   519            if self.compression_type is None:
   520              with open(file_name, 'rb') as file:
   521                for record in file:
   522                  value = self.coder.decode(record.rstrip(b'\n'))
   523                  yield WindowedValue(value, -1, [window.GlobalWindow()])
   524            else:
   525              with gzip.open(file_name, 'rb') as file:
   526                for record in file:
   527                  value = self.coder.decode(record.rstrip(b'\n'))
   528                  yield WindowedValue(value, -1, [window.GlobalWindow()])
   529  
   530      def expand(self, pcoll):
   531        return pcoll | beam.Create([None]) | 'DummyReadForTesting' >> beam.ParDo(
   532            SnippetsTest.DummyReadTransform.ReadDoFn(
   533                self.file_to_read, self.compression_type))
   534  
   535    class DummyWriteTransform(beam.PTransform):
   536      """A transform that will replace iobase.WriteToText.
   537  
   538      To be used for testing.
   539      """
   540      def __init__(self, file_to_write=None, file_name_suffix=''):
   541        self.file_to_write = file_to_write
   542  
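               # file_name_suffix is accepted only so that snippets which pass it
               # still work; this dummy transform ignores it.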
   543      class WriteDoFn(beam.DoFn):
   544        def __init__(self, file_to_write):
   545          self.file_to_write = file_to_write
   546          self.file_obj = None
   547          self.coder = ToBytesCoder()
   548  
   549        def start_bundle(self):
   550          assert self.file_to_write
    551          # Append a UUID so that each bundle invocation writes to a unique file.
   552          self.file_obj = open(self.file_to_write + str(uuid.uuid4()), 'wb')
   553  
   554        def process(self, element):
   555          assert self.file_obj
   556          self.file_obj.write(self.coder.encode(element) + b'\n')
   557  
   558        def finish_bundle(self):
   559          assert self.file_obj
   560          self.file_obj.close()
   561  
   562      def expand(self, pcoll):
   563        return pcoll | 'DummyWriteForTesting' >> beam.ParDo(
   564            SnippetsTest.DummyWriteTransform.WriteDoFn(self.file_to_write))
   565  
   566    def setUp(self):
   567      self.old_read_from_text = beam.io.ReadFromText
   568      self.old_write_to_text = beam.io.WriteToText
   569  
   570      # Monkey patching to allow testing pipelines defined in snippets.py using
   571      # real data.
   572      beam.io.ReadFromText = SnippetsTest.DummyReadTransform
   573      beam.io.WriteToText = SnippetsTest.DummyWriteTransform
   574      self.temp_files = []
   575  
   576    def tearDown(self):
   577      beam.io.ReadFromText = self.old_read_from_text
   578      beam.io.WriteToText = self.old_write_to_text
    579      # Clean up all the temporary files created in the test. Note that map()
             # is lazy in Python 3, so iterate explicitly to actually delete the files.
    580      for temp_file in self.temp_files:
               os.remove(temp_file)
   581      # Ensure that PipelineOptions subclasses have been cleaned up between tests
   582      gc.collect()
   583  
   584    def create_temp_file(self, contents=''):
   585      with tempfile.NamedTemporaryFile(delete=False) as f:
   586        f.write(contents.encode('utf-8'))
   587        self.temp_files.append(f.name)
   588        return f.name
   589  
   590    def get_output(self, path, sorted_output=True, suffix=''):
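             """Collects lines from all output shards matching path* (sorted by default)."""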
   591      all_lines = []
   592      for file_name in glob.glob(path + '*'):
   593        with open(file_name) as f:
   594          lines = f.readlines()
   595          all_lines.extend([s.rstrip('\n') for s in lines])
   596  
   597      if sorted_output:
   598        return sorted(s.rstrip('\n') for s in all_lines)
   599      return all_lines
   600  
   601    def test_model_pipelines(self):
   602      temp_path = self.create_temp_file('aa bb cc\n bb cc\n cc')
   603      result_path = temp_path + '.result'
   604      test_argv = [
   605          "unused_argv[0]",
   606          f"--input-file={temp_path}*",
   607          f"--output-path={result_path}",
   608      ]
   609      with mock.patch.object(sys, 'argv', test_argv):
   610        snippets.model_pipelines()
   611      self.assertEqual(
   612          self.get_output(result_path),
   613          [str(s) for s in [(u'aa', 1), (u'bb', 2), (u'cc', 3)]])
   614  
   615    def test_model_pcollection(self):
   616      temp_path = self.create_temp_file()
   617      snippets.model_pcollection(temp_path)
   618      self.assertEqual(
   619          self.get_output(temp_path),
   620          [
   621              'Or to take arms against a sea of troubles, ',
   622              'The slings and arrows of outrageous fortune, ',
   623              'To be, or not to be: that is the question: ',
   624              'Whether \'tis nobler in the mind to suffer ',
   625          ])
   626  
   627    def test_construct_pipeline(self):
   628      temp_path = self.create_temp_file('abc def ghi\n jkl mno pqr\n stu vwx yz')
   629      result_path = self.create_temp_file()
   630      snippets.construct_pipeline({'read': temp_path, 'write': result_path})
   631      self.assertEqual(
   632          self.get_output(result_path),
   633          ['cba', 'fed', 'ihg', 'lkj', 'onm', 'rqp', 'uts', 'xwv', 'zy'])
   634  
   635    def test_model_custom_source(self):
   636      snippets.model_custom_source(100)
   637  
   638    def test_model_custom_sink(self):
   639      tempdir_name = tempfile.mkdtemp()
   640  
   641      class SimpleKV(object):
   642        def __init__(self, tmp_dir):
   643          self._dummy_token = 'dummy_token'
   644          self._tmp_dir = tmp_dir
   645  
   646        def connect(self, url):
   647          return self._dummy_token
   648  
   649        def open_table(self, access_token, table_name):
   650          assert access_token == self._dummy_token
   651          file_name = self._tmp_dir + os.sep + table_name
   652          assert not os.path.exists(file_name)
   653          open(file_name, 'wb').close()
   654          return table_name
   655  
   656        def write_to_table(self, access_token, table_name, key, value):
   657          assert access_token == self._dummy_token
   658          file_name = self._tmp_dir + os.sep + table_name
   659          assert os.path.exists(file_name)
   660          with open(file_name, 'ab') as f:
   661            content = (key + ':' + value + os.linesep).encode('utf-8')
   662            f.write(content)
   663  
   664        def rename_table(self, access_token, old_name, new_name):
   665          assert access_token == self._dummy_token
   666          old_file_name = self._tmp_dir + os.sep + old_name
   667          new_file_name = self._tmp_dir + os.sep + new_name
   668          assert os.path.isfile(old_file_name)
   669          assert not os.path.exists(new_file_name)
   670  
   671          os.rename(old_file_name, new_file_name)
   672  
   673      snippets.model_custom_sink(
   674          SimpleKV(tempdir_name),
   675          [('key' + str(i), 'value' + str(i)) for i in range(100)],
   676          'final_table_no_ptransform',
   677          'final_table_with_ptransform')
   678  
   679      expected_output = [
   680          'key' + str(i) + ':' + 'value' + str(i) for i in range(100)
   681      ]
   682  
   683      glob_pattern = tempdir_name + os.sep + 'final_table_no_ptransform*'
   684      output_files = glob.glob(glob_pattern)
   685      assert output_files
   686  
   687      received_output = []
   688      for file_name in output_files:
   689        with open(file_name) as f:
   690          for line in f:
   691            received_output.append(line.rstrip(os.linesep))
   692  
   693      self.assertCountEqual(expected_output, received_output)
   694  
   695      glob_pattern = tempdir_name + os.sep + 'final_table_with_ptransform*'
   696      output_files = glob.glob(glob_pattern)
   697      assert output_files
   698  
   699      received_output = []
   700      for file_name in output_files:
   701        with open(file_name) as f:
   702          for line in f:
   703            received_output.append(line.rstrip(os.linesep))
   704  
   705      self.assertCountEqual(expected_output, received_output)
   706  
   707    def test_model_textio(self):
   708      temp_path = self.create_temp_file('aa bb cc\n bb cc\n cc')
   709      result_path = temp_path + '.result'
   710      snippets.model_textio({'read': temp_path, 'write': result_path})
   711      self.assertEqual(['aa', 'bb', 'bb', 'cc', 'cc', 'cc'],
   712                       self.get_output(result_path, suffix='.csv'))
   713  
   714    def test_model_textio_compressed(self):
   715      temp_path = self.create_temp_file('aa\nbb\ncc')
   716      gzip_file_name = temp_path + '.gz'
   717      with open(temp_path, 'rb') as src, gzip.open(gzip_file_name, 'wb') as dst:
   718        dst.writelines(src)
   719        # Add the temporary gzip file to be cleaned up as well.
   720        self.temp_files.append(gzip_file_name)
   721      snippets.model_textio_compressed({'read': gzip_file_name},
   722                                       ['aa', 'bb', 'cc'])
   723  
   724    @unittest.skipIf(
   725        datastore_client is None, 'GCP dependencies are not installed')
   726    def test_model_datastoreio(self):
    727      # We cannot test DatastoreIO functionality in unit tests, so we limit
    728      # ourselves to making sure the pipeline containing Datastore read and write
    729      # transforms can be built.
    730      # TODO(vikasrk): Explore using the Datastore Emulator.
   731      snippets.model_datastoreio()
   732  
   733    @unittest.skipIf(base_api is None, 'GCP dependencies are not installed')
   734    def test_model_bigqueryio(self):
    735      # We cannot test BigQueryIO functionality in unit tests, so we limit
    736      # ourselves to making sure the pipeline containing BigQuery sources and
    737      # sinks can be built.
   738      #
   739      # To run locally, set `run_locally` to `True`. You will also have to set
   740      # `project`, `dataset` and `table` to the BigQuery table the test will write
   741      # to.
   742      run_locally = False
   743      if run_locally:
   744        project = 'my-project'
   745        dataset = 'samples'  # this must already exist
   746        table = 'model_bigqueryio'  # this will be created if needed
   747  
   748        options = PipelineOptions().view_as(GoogleCloudOptions)
   749        options.project = project
   750        with beam.Pipeline(options=options) as p:
   751          snippets.model_bigqueryio(p, project, dataset, table)
   752      else:
   753        p = TestPipeline()
   754        p.options.view_as(GoogleCloudOptions).temp_location = 'gs://mylocation'
   755        snippets.model_bigqueryio(p)
   756  
   757    def _run_test_pipeline_for_options(self, fn):
   758      temp_path = self.create_temp_file('aa\nbb\ncc')
   759      result_path = temp_path + '.result'
   760      test_argv = [
   761          "unused_argv[0]",
   762          f"--input={temp_path}*",
   763          f"--output={result_path}",
   764      ]
   765      with mock.patch.object(sys, 'argv', test_argv):
   766        fn()
   767      self.assertEqual(['aa', 'bb', 'cc'], self.get_output(result_path))
   768  
   769    def test_pipeline_options_local(self):
   770      self._run_test_pipeline_for_options(snippets.pipeline_options_local)
   771  
   772    def test_pipeline_options_remote(self):
   773      self._run_test_pipeline_for_options(snippets.pipeline_options_remote)
   774  
   775    def test_pipeline_options_command_line(self):
   776      self._run_test_pipeline_for_options(snippets.pipeline_options_command_line)
   777  
   778    def test_pipeline_logging(self):
   779      result_path = self.create_temp_file()
   780      lines = [
   781          'we found love right where we are',
   782          'we found love right from the start',
   783          'we found love in a hopeless place'
   784      ]
   785      snippets.pipeline_logging(lines, result_path)
   786      self.assertEqual(
   787          sorted(' '.join(lines).split(' ')), self.get_output(result_path))
   788  
   789    @parameterized.parameterized.expand([
   790        [snippets.examples_wordcount_minimal],
   791        [snippets.examples_wordcount_wordcount],
   792        [snippets.pipeline_monitoring],
   793        [snippets.examples_wordcount_templated],
   794    ])
   795    def test_examples_wordcount(self, pipeline):
   796      temp_path = self.create_temp_file('abc def ghi\n abc jkl')
   797      result_path = self.create_temp_file()
   798      test_argv = [
   799          "unused_argv[0]",
   800          f"--input-file={temp_path}*",
   801          f"--output-path={result_path}",
   802      ]
   803      with mock.patch.object(sys, 'argv', test_argv):
   804        pipeline()
   805      self.assertEqual(
   806          self.get_output(result_path), ['abc: 2', 'def: 1', 'ghi: 1', 'jkl: 1'])
   807  
   808    def test_examples_ptransforms_templated(self):
   809      pipelines = [snippets.examples_ptransforms_templated]
   810  
   811      for pipeline in pipelines:
   812        temp_path = self.create_temp_file('1\n 2\n 3')
   813        result_path = self.create_temp_file()
   814        pipeline({'read': temp_path, 'write': result_path})
   815        self.assertEqual(self.get_output(result_path), ['11', '12', '13'])
   816  
   817    def test_examples_wordcount_debugging(self):
   818      temp_path = self.create_temp_file(
   819          'Flourish Flourish Flourish stomach abc def')
   820      result_path = self.create_temp_file()
   821      snippets.examples_wordcount_debugging({
   822          'read': temp_path, 'write': result_path
   823      })
   824      self.assertEqual(
   825          self.get_output(result_path), ['Flourish: 3', 'stomach: 1'])
   826  
   827    @unittest.skipIf(pubsub is None, 'GCP dependencies are not installed')
   828    @mock.patch('apache_beam.io.ReadFromPubSub')
   829    @mock.patch('apache_beam.io.WriteToPubSub')
   830    def test_examples_wordcount_streaming(self, *unused_mocks):
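             # Build fakes for the PubSub transforms: each factory captures the
             # expected topic/subscription and returns a callable that asserts on
             # the arguments it receives and substitutes a TestStream (for reads)
             # or an asserting transform (for writes).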
   831      def FakeReadFromPubSub(topic=None, subscription=None, values=None):
   832        expected_topic = topic
   833        expected_subscription = subscription
   834  
   835        def _inner(topic=None, subscription=None):
   836          assert topic == expected_topic
   837          assert subscription == expected_subscription
   838          return TestStream().add_elements(values)
   839  
   840        return _inner
   841  
   842      class AssertTransform(beam.PTransform):
   843        def __init__(self, matcher):
   844          self.matcher = matcher
   845  
   846        def expand(self, pcoll):
   847          assert_that(pcoll, self.matcher)
   848  
   849      def FakeWriteToPubSub(topic=None, values=None):
   850        expected_topic = topic
   851  
   852        def _inner(topic=None, subscription=None):
   853          assert topic == expected_topic
   854          return AssertTransform(equal_to(values))
   855  
   856        return _inner
   857  
   858      # Test basic execution.
   859      input_topic = 'projects/fake-beam-test-project/topic/intopic'
   860      input_values = [
   861          TimestampedValue(b'a a b', 1),
   862          TimestampedValue(u'🤷 ¯\\_(ツ)_/¯ b b '.encode('utf-8'), 12),
   863          TimestampedValue(b'a b c c c', 20)
   864      ]
   865      output_topic = 'projects/fake-beam-test-project/topic/outtopic'
   866      output_values = [b'a: 1', b'a: 2', b'b: 1', b'b: 3', b'c: 3']
   867      beam.io.ReadFromPubSub = (
   868          FakeReadFromPubSub(topic=input_topic, values=input_values))
   869      beam.io.WriteToPubSub = (
   870          FakeWriteToPubSub(topic=output_topic, values=output_values))
   871      test_argv = [
   872          'unused_argv[0]',
   873          '--input_topic',
   874          'projects/fake-beam-test-project/topic/intopic',
   875          '--output_topic',
   876          'projects/fake-beam-test-project/topic/outtopic'
   877      ]
   878      with mock.patch.object(sys, 'argv', test_argv):
   879        snippets.examples_wordcount_streaming()
   880  
   881      # Test with custom subscription.
   882      input_sub = 'projects/fake-beam-test-project/subscriptions/insub'
   883      beam.io.ReadFromPubSub = FakeReadFromPubSub(
   884          subscription=input_sub, values=input_values)
   885      test_argv = [
   886          'unused_argv[0]',
   887          '--input_subscription',
   888          'projects/fake-beam-test-project/subscriptions/insub',
   889          '--output_topic',
   890          'projects/fake-beam-test-project/topic/outtopic'
   891      ]
   892      with mock.patch.object(sys, 'argv', test_argv):
   893        snippets.examples_wordcount_streaming()
   894  
   895    def test_model_composite_transform_example(self):
   896      contents = ['aa bb cc', 'bb cc', 'cc']
   897      result_path = self.create_temp_file()
   898      snippets.model_composite_transform_example(contents, result_path)
   899      self.assertEqual(['aa: 1', 'bb: 2', 'cc: 3'], self.get_output(result_path))
   900  
   901    def test_model_multiple_pcollections_flatten(self):
   902      contents = ['a', 'b', 'c', 'd', 'e', 'f']
   903      result_path = self.create_temp_file()
   904      snippets.model_multiple_pcollections_flatten(contents, result_path)
   905      self.assertEqual(contents, self.get_output(result_path))
   906  
   907    def test_model_multiple_pcollections_partition(self):
   908      contents = [17, 42, 64, 32, 0, 99, 53, 89]
   909      result_path = self.create_temp_file()
   910      snippets.model_multiple_pcollections_partition(contents, result_path)
   911      self.assertEqual(['0', '17', '32', '42', '53', '64', '89', '99'],
   912                       self.get_output(result_path))
   913  
   914    def test_model_group_by_key(self):
   915      contents = ['a bb ccc bb bb a']
   916      result_path = self.create_temp_file()
   917      snippets.model_group_by_key(contents, result_path)
   918      expected = [('a', 2), ('bb', 3), ('ccc', 1)]
   919      self.assertEqual([str(s) for s in expected], self.get_output(result_path))
   920  
   921    def test_model_co_group_by_key_tuple(self):
   922      with TestPipeline() as p:
   923        # [START model_group_by_key_cogroupbykey_tuple_inputs]
   924        emails_list = [
   925            ('amy', 'amy@example.com'),
   926            ('carl', 'carl@example.com'),
   927            ('julia', 'julia@example.com'),
   928            ('carl', 'carl@email.com'),
   929        ]
   930        phones_list = [
   931            ('amy', '111-222-3333'),
   932            ('james', '222-333-4444'),
   933            ('amy', '333-444-5555'),
   934            ('carl', '444-555-6666'),
   935        ]
   936  
   937        emails = p | 'CreateEmails' >> beam.Create(emails_list)
   938        phones = p | 'CreatePhones' >> beam.Create(phones_list)
   939        # [END model_group_by_key_cogroupbykey_tuple_inputs]
   940  
   941        result_path = self.create_temp_file()
   942        snippets.model_co_group_by_key_tuple(emails, phones, result_path)
   943  
   944      # [START model_group_by_key_cogroupbykey_tuple_outputs]
   945      results = [
   946          (
   947              'amy',
   948              {
   949                  'emails': ['amy@example.com'],
   950                  'phones': ['111-222-3333', '333-444-5555']
   951              }),
   952          (
   953              'carl',
   954              {
   955                  'emails': ['carl@email.com', 'carl@example.com'],
   956                  'phones': ['444-555-6666']
   957              }),
   958          ('james', {
   959              'emails': [], 'phones': ['222-333-4444']
   960          }),
   961          ('julia', {
   962              'emails': ['julia@example.com'], 'phones': []
   963          }),
   964      ]
   965      # [END model_group_by_key_cogroupbykey_tuple_outputs]
   966      # [START model_group_by_key_cogroupbykey_tuple_formatted_outputs]
   967      formatted_results = [
   968          "amy; ['amy@example.com']; ['111-222-3333', '333-444-5555']",
   969          "carl; ['carl@email.com', 'carl@example.com']; ['444-555-6666']",
   970          "james; []; ['222-333-4444']",
   971          "julia; ['julia@example.com']; []",
   972      ]
   973      # [END model_group_by_key_cogroupbykey_tuple_formatted_outputs]
   974      expected_results = [
   975          '%s; %s; %s' % (name, info['emails'], info['phones']) for name,
   976          info in results
   977      ]
   978      self.assertEqual(expected_results, formatted_results)
   979      self.assertEqual(formatted_results, self.get_output(result_path))
   980  
   981    def test_model_use_and_query_metrics(self):
   982      """DebuggingWordCount example snippets."""
   983  
   984      import re
   985  
   986      p = TestPipeline()  # Use TestPipeline for testing.
   987      words = p | beam.Create(
   988          ['albert', 'sam', 'mark', 'sarah', 'swati', 'daniel', 'andrea'])
   989  
   990      # pylint: disable=unused-variable
   991      # [START metrics_usage_example]
   992      class FilterTextFn(beam.DoFn):
   993        """A DoFn that filters for a specific key based on a regex."""
   994        def __init__(self, pattern):
   995          self.pattern = pattern
   996          # A custom metric can track values in your pipeline as it runs. Create
   997          # custom metrics to count unmatched words, and know the distribution of
   998          # word lengths in the input PCollection.
   999          self.word_len_dist = Metrics.distribution(
  1000              self.__class__, 'word_len_dist')
  1001          self.unmatched_words = Metrics.counter(
  1002              self.__class__, 'unmatched_words')
  1003  
  1004        def process(self, element):
  1005          word = element
  1006          self.word_len_dist.update(len(word))
  1007          if re.match(self.pattern, word):
  1008            yield element
  1009          else:
  1010            self.unmatched_words.inc()
  1011  
  1012      filtered_words = (words | 'FilterText' >> beam.ParDo(FilterTextFn('s.*')))
  1013      # [END metrics_usage_example]
  1014      # pylint: enable=unused-variable
  1015  
  1016      # [START metrics_check_values_example]
  1017      result = p.run()
  1018      result.wait_until_finish()
  1019  
  1020      custom_distribution = result.metrics().query(
  1021          MetricsFilter().with_name('word_len_dist'))['distributions']
  1022      custom_counter = result.metrics().query(
  1023          MetricsFilter().with_name('unmatched_words'))['counters']
  1024  
  1025      if custom_distribution:
  1026        logging.info(
  1027            'The average word length was %d',
  1028            custom_distribution[0].committed.mean)
  1029      if custom_counter:
  1030        logging.info(
  1031            'There were %d words that did not match the filter.',
  1032            custom_counter[0].committed)
  1033      # [END metrics_check_values_example]
  1034  
  1035      # There should be 4 words that did not match
  1036      self.assertEqual(custom_counter[0].committed, 4)
  1037      # The shortest word is 3 characters, the longest is 6
  1038      self.assertEqual(custom_distribution[0].committed.min, 3)
  1039      self.assertEqual(custom_distribution[0].committed.max, 6)
  1040  
  1041    def test_model_join_using_side_inputs(self):
  1042      name_list = ['a', 'b']
  1043      email_list = [['a', 'a@example.com'], ['b', 'b@example.com']]
  1044      phone_list = [['a', 'x4312'], ['b', 'x8452']]
  1045      result_path = self.create_temp_file()
  1046      snippets.model_join_using_side_inputs(
  1047          name_list, email_list, phone_list, result_path)
  1048      expect = ['a; a@example.com; x4312', 'b; b@example.com; x8452']
  1049      self.assertEqual(expect, self.get_output(result_path))
  1050  
  1051    def test_model_early_late_triggers(self):
  1052      pipeline_options = PipelineOptions()
  1053      pipeline_options.view_as(StandardOptions).streaming = True
  1054  
  1055      with TestPipeline(options=pipeline_options) as p:
  1056        test_stream = (
  1057            TestStream().advance_watermark_to(10).add_elements([
  1058                'a', 'a', 'a', 'b', 'b'
  1059            ]).add_elements([
  1060                TimestampedValue('a', 10)
  1061            ]).advance_watermark_to(20).advance_processing_time(60).add_elements(
  1062                [TimestampedValue('a', 10)]))
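               # The final element has timestamp 10 but arrives after the watermark
               # has passed the end of its window, so it is emitted in a separate
               # late pane: the extra ('a', 1) in the expected output below.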
  1063        trigger = (
  1064            # [START model_early_late_triggers]
  1065            AfterWatermark(
  1066                early=AfterProcessingTime(delay=1 * 60), late=AfterCount(1))
  1067            # [END model_early_late_triggers]
  1068        )
  1069        counts = (
  1070            p
  1071            | test_stream
  1072            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
  1073            | WindowInto(
  1074                FixedWindows(15),
  1075                trigger=trigger,
  1076                allowed_lateness=20,
  1077                accumulation_mode=AccumulationMode.DISCARDING)
  1078            | 'group' >> beam.GroupByKey()
  1079            | 'count' >>
  1080            beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
  1081        assert_that(counts, equal_to([('a', 4), ('b', 2), ('a', 1)]))
  1082  
  1083    def test_model_setting_trigger(self):
  1084      pipeline_options = PipelineOptions(
  1085          flags=['--streaming', '--allow_unsafe_triggers'])
  1086  
  1087      with TestPipeline(options=pipeline_options) as p:
  1088        test_stream = (
  1089            TestStream().advance_watermark_to(10).add_elements(
  1090                ['a', 'a', 'a', 'b',
  1091                 'b']).advance_watermark_to(70).advance_processing_time(600))
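               # advance_processing_time(600) satisfies the AfterProcessingTime(10 * 60)
               # trigger below, so the window fires once with all five elements.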
  1092        pcollection = (
  1093            p
  1094            | test_stream
  1095            | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))
  1096  
  1097        counts = (
  1098            pcollection | WindowInto(
  1099                FixedWindows(1 * 60),
  1100                trigger=AfterProcessingTime(10 * 60),
  1101                accumulation_mode=AccumulationMode.DISCARDING)
  1102            | 'group' >> beam.GroupByKey()
  1103            | 'count' >>
  1104            beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
  1105        assert_that(counts, equal_to([('a', 3), ('b', 2)]))
  1106  
  1107    def test_model_composite_triggers(self):
  1108      pipeline_options = PipelineOptions()
  1109      pipeline_options.view_as(StandardOptions).streaming = True
  1110  
  1111      with TestPipeline(options=pipeline_options) as p:
  1112        test_stream = (
  1113            TestStream().advance_watermark_to(10).add_elements(
  1114                ['a', 'a', 'a', 'b', 'b']).advance_watermark_to(70).add_elements([
  1115                    TimestampedValue('a', 10),
  1116                    TimestampedValue('a', 10),
  1117                    TimestampedValue('c', 10),
  1118                    TimestampedValue('c', 10)
  1119                ]).advance_processing_time(600))
  1120        pcollection = (
  1121            p
  1122            | test_stream
  1123            | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))
  1124  
  1125        counts = (
  1126            # [START model_composite_triggers]
  1127            pcollection | WindowInto(
  1128                FixedWindows(1 * 60),
  1129                trigger=AfterWatermark(late=AfterProcessingTime(10 * 60)),
  1130                allowed_lateness=10,
  1131                accumulation_mode=AccumulationMode.DISCARDING)
  1132            # [END model_composite_triggers]
  1133            | 'group' >> beam.GroupByKey()
  1134            | 'count' >>
  1135            beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
  1136        assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 2), ('c', 2)]))
  1137  
  1138    def test_model_other_composite_triggers(self):
  1139      pipeline_options = PipelineOptions(
  1140          flags=['--streaming', '--allow_unsafe_triggers'])
  1141  
  1142      with TestPipeline(options=pipeline_options) as p:
  1143        test_stream = (
  1144            TestStream().advance_watermark_to(10).add_elements(
  1145                ['a', 'a']).add_elements(
  1146                    ['a', 'b',
  1147                     'b']).advance_processing_time(60).add_elements(['a'] * 100))
  1148        pcollection = (
  1149            p
  1150            | test_stream
  1151            | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))
  1152  
  1153        counts = (
  1154            # [START model_other_composite_triggers]
  1155            pcollection | WindowInto(
  1156                FixedWindows(1 * 60),
  1157                trigger=Repeatedly(
  1158                    AfterAny(AfterCount(100), AfterProcessingTime(1 * 60))),
  1159                accumulation_mode=AccumulationMode.DISCARDING)
  1160            # [END model_other_composite_triggers]
  1161            | 'group' >> beam.GroupByKey()
  1162            | 'count' >>
  1163            beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
  1164        assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 100)]))
  1165  
  1166  
  1167  class CombineTest(unittest.TestCase):
  1168    """Tests for model/combine."""
  1169    def test_global_sum(self):
  1170      pc = [1, 2, 3]
  1171      # [START global_sum]
  1172      result = pc | beam.CombineGlobally(sum)
  1173      # [END global_sum]
  1174      self.assertEqual([6], result)
  1175  
  1176    def test_combine_values(self):
   1177      occurrences = [('cat', 1), ('cat', 5), ('cat', 9), ('dog', 5), ('dog', 2)]
   1178      # [START combine_values]
   1179      first_occurrences = occurrences | beam.GroupByKey() | beam.CombineValues(min)
   1180      # [END combine_values]
   1181      self.assertEqual({('cat', 1), ('dog', 2)}, set(first_occurrences))
  1182  
  1183    def test_combine_per_key(self):
  1184      player_accuracies = [('cat', 1), ('cat', 5), ('cat', 9), ('cat', 1),
  1185                           ('dog', 5), ('dog', 2)]
  1186      # [START combine_per_key]
  1187      avg_accuracy_per_player = (
  1188          player_accuracies
  1189          | beam.CombinePerKey(beam.combiners.MeanCombineFn()))
  1190      # [END combine_per_key]
  1191      self.assertEqual({('cat', 4.0), ('dog', 3.5)}, set(avg_accuracy_per_player))
  1192  
  1193    def test_combine_concat(self):
  1194      pc = ['a', 'b']
  1195  
  1196      # [START combine_concat]
  1197      def concat(values, separator=', '):
  1198        return separator.join(values)
  1199  
  1200      with_commas = pc | beam.CombineGlobally(concat)
  1201      with_dashes = pc | beam.CombineGlobally(concat, separator='-')
  1202      # [END combine_concat]
  1203      self.assertEqual(1, len(with_commas))
  1204      self.assertTrue(with_commas[0] in {'a, b', 'b, a'})
  1205      self.assertEqual(1, len(with_dashes))
  1206      self.assertTrue(with_dashes[0] in {'a-b', 'b-a'})
  1207  
  1208    def test_bounded_sum(self):
  1209      # [START combine_bounded_sum]
  1210      pc = [1, 10, 100, 1000]
  1211  
  1212      def bounded_sum(values, bound=500):
  1213        return min(sum(values), bound)
  1214  
  1215      small_sum = pc | beam.CombineGlobally(bounded_sum)  # [500]
  1216      large_sum = pc | beam.CombineGlobally(bounded_sum, bound=5000)  # [1111]
  1217      # [END combine_bounded_sum]
  1218      self.assertEqual([500], small_sum)
  1219      self.assertEqual([1111], large_sum)
  1220  
  1221    def test_combine_reduce(self):
  1222      factors = [2, 3, 5, 7]
  1223      # [START combine_reduce]
  1224      import functools
  1225      import operator
  1226      product = factors | beam.CombineGlobally(
  1227          functools.partial(functools.reduce, operator.mul), 1)
  1228      # [END combine_reduce]
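             # The trailing 1 is forwarded to the callable, i.e. the reduction runs as
             # functools.reduce(operator.mul, values, 1), giving 2 * 3 * 5 * 7 == 210.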
  1229      self.assertEqual([210], product)
  1230  
  1231    def test_custom_average(self):
  1232      pc = [2, 3, 5, 7]
  1233  
  1234      # [START combine_custom_average_define]
  1235      class AverageFn(beam.CombineFn):
  1236        def create_accumulator(self):
  1237          return (0.0, 0)
  1238  
  1239        def add_input(self, sum_count, input):
  1240          (sum, count) = sum_count
  1241          return sum + input, count + 1
  1242  
  1243        def merge_accumulators(self, accumulators):
  1244          sums, counts = zip(*accumulators)
  1245          return sum(sums), sum(counts)
  1246  
  1247        def extract_output(self, sum_count):
  1248          (sum, count) = sum_count
  1249          return sum / count if count else float('NaN')
  1250  
  1251      # [END combine_custom_average_define]
  1252      # [START combine_custom_average_execute]
  1253      average = pc | beam.CombineGlobally(AverageFn())
  1254      # [END combine_custom_average_execute]
  1255      self.assertEqual([4.25], average)
  1256  
  1257    def test_keys(self):
  1258      occurrences = [('cat', 1), ('cat', 5), ('dog', 5), ('cat', 9), ('dog', 2)]
  1259      unique_keys = occurrences | snippets.Keys()
  1260      self.assertEqual({'cat', 'dog'}, set(unique_keys))
  1261  
  1262    def test_count(self):
  1263      occurrences = ['cat', 'dog', 'cat', 'cat', 'dog']
  1264      perkey_counts = occurrences | snippets.Count()
  1265      self.assertEqual({('cat', 3), ('dog', 2)}, set(perkey_counts))
  1266  
  1267    def test_setting_fixed_windows(self):
  1268      with TestPipeline() as p:
  1269        unkeyed_items = p | beam.Create([22, 33, 55, 100, 115, 120])
  1270        items = (
  1271            unkeyed_items
  1272            | 'key' >>
  1273            beam.Map(lambda x: beam.window.TimestampedValue(('k', x), x)))
  1274        # [START setting_fixed_windows]
  1275        from apache_beam import window
  1276        fixed_windowed_items = (
  1277            items | 'window' >> beam.WindowInto(window.FixedWindows(60)))
  1278        # [END setting_fixed_windows]
  1279        summed = (
  1280            fixed_windowed_items
  1281            | 'group' >> beam.GroupByKey()
  1282            | 'combine' >> beam.CombineValues(sum))
  1283        unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
              # With 60-second fixed windows: 22 + 33 + 55 = 110 in [0, 60),
              # 100 + 115 = 215 in [60, 120), and 120 alone in [120, 180).
  1284        assert_that(unkeyed, equal_to([110, 215, 120]))
  1285  
  1286    def test_setting_sliding_windows(self):
  1287      with TestPipeline() as p:
  1288        unkeyed_items = p | beam.Create([2, 16, 23])
  1289        items = (
  1290            unkeyed_items
  1291            | 'key' >>
  1292            beam.Map(lambda x: beam.window.TimestampedValue(('k', x), x)))
  1293        # [START setting_sliding_windows]
  1294        from apache_beam import window
  1295        sliding_windowed_items = (
  1296            items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
  1297        # [END setting_sliding_windows]
  1298        summed = (
  1299            sliding_windowed_items
  1300            | 'group' >> beam.GroupByKey()
  1301            | 'combine' >> beam.CombineValues(sum))
  1302        unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
              # Each element falls into 30 / 5 = 6 overlapping windows, so the
              # per-window sums repeat as the windows slide across 2, 16 and 23.
  1303        assert_that(unkeyed, equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
  1304  
  1305    def test_setting_session_windows(self):
  1306      with TestPipeline() as p:
  1307        unkeyed_items = p | beam.Create([2, 11, 16, 27])
  1308        items = (
  1309            unkeyed_items
  1310            | 'key' >>
  1311            beam.Map(lambda x: beam.window.TimestampedValue(('k', x), x * 60)))
  1312        # [START setting_session_windows]
  1313        from apache_beam import window
  1314        session_windowed_items = (
  1315            items | 'window' >> beam.WindowInto(window.Sessions(10 * 60)))
  1316        # [END setting_session_windows]
  1317        summed = (
  1318            session_windowed_items
  1319            | 'group' >> beam.GroupByKey()
  1320            | 'combine' >> beam.CombineValues(sum))
  1321        unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
              # 2, 11 and 16 minutes are each within the 10-minute gap of the
              # previous element and merge into one session (2 + 11 + 16 = 29);
              # 27 minutes starts a new session.
  1322        assert_that(unkeyed, equal_to([29, 27]))
  1323  
  1324    def test_setting_global_window(self):
  1325      with TestPipeline() as p:
  1326        unkeyed_items = p | beam.Create([2, 11, 16, 27])
  1327        items = (
  1328            unkeyed_items
  1329            | 'key' >>
  1330            beam.Map(lambda x: beam.window.TimestampedValue(('k', x), x)))
  1331        # [START setting_global_window]
  1332        from apache_beam import window
  1333        global_windowed_items = (
  1334            items | 'window' >> beam.WindowInto(window.GlobalWindows()))
  1335        # [END setting_global_window]
  1336        summed = (
  1337            global_windowed_items
  1338            | 'group' >> beam.GroupByKey()
  1339            | 'combine' >> beam.CombineValues(sum))
  1340        unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
              # A single global window holds everything: 2 + 11 + 16 + 27 = 56.
  1341        assert_that(unkeyed, equal_to([56]))
  1342  
  1343    def test_setting_timestamp(self):
  1344      with TestPipeline() as p:
  1345        unkeyed_items = p | beam.Create([12, 30, 60, 61, 66])
  1346        items = (unkeyed_items | 'key' >> beam.Map(lambda x: ('k', x)))
  1347  
  1348        def extract_timestamp_from_log_entry(entry):
  1349          return entry[1]
  1350  
  1351        # [START setting_timestamp]
  1352        class AddTimestampDoFn(beam.DoFn):
  1353          def process(self, element):
  1354            # Extract the numeric Unix seconds-since-epoch timestamp to be
  1355            # associated with the current log entry.
  1356            unix_timestamp = extract_timestamp_from_log_entry(element)
  1357            # Wrap and emit the current entry and new timestamp in a
  1358            # TimestampedValue.
  1359            yield beam.window.TimestampedValue(element, unix_timestamp)
  1360  
  1361        timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
  1362        # [END setting_timestamp]
  1363        fixed_windowed_items = (
  1364            timestamped_items
  1365            | 'window' >> beam.WindowInto(beam.window.FixedWindows(60)))
  1366        summed = (
  1367            fixed_windowed_items
  1368            | 'group' >> beam.GroupByKey()
  1369            | 'combine' >> beam.CombineValues(sum))
  1370        unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
              # [0, 60) holds 12 + 30 = 42; [60, 120) holds 60 + 61 + 66 = 187.
  1371        assert_that(unkeyed, equal_to([42, 187]))
  1372  
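          # A minimal sketch, not part of the original snippet set: for simple
          # cases the timestamp can also be attached with beam.Map instead of a
          # DoFn, wrapping each element in a TimestampedValue.
          def test_setting_timestamp_with_map_sketch(self):
            with TestPipeline() as p:
              items = p | beam.Create([('k', 12), ('k', 61)])
              summed = (
                  items
                  | 'timestamp' >> beam.Map(
                      lambda elem: beam.window.TimestampedValue(elem, elem[1]))
                  | 'window' >> beam.WindowInto(beam.window.FixedWindows(60))
                  | 'group' >> beam.GroupByKey()
                  | 'combine' >> beam.CombineValues(sum))
              unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
              assert_that(unkeyed, equal_to([12, 61]))
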
  1373  
  1374  class PTransformTest(unittest.TestCase):
  1375    """Tests for PTransform."""
  1376    def test_composite(self):
  1377  
  1378      # [START model_composite_transform]
  1379      class ComputeWordLengths(beam.PTransform):
  1380        def expand(self, pcoll):
  1381          # Transform logic goes here.
  1382          return pcoll | beam.Map(lambda x: len(x))
  1383  
  1384      # [END model_composite_transform]
  1385  
  1386      with TestPipeline() as p:
  1387        lengths = p | beam.Create(["a", "ab", "abc"]) | ComputeWordLengths()
  1388        assert_that(lengths, equal_to([1, 2, 3]))
  1389  
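          # A minimal sketch, not part of the original snippet set: expand() can
          # chain several transforms; the transform name and element values here
          # are purely illustrative.
          def test_composite_multi_step_sketch(self):
            class NonEmptyWordLengths(beam.PTransform):
              def expand(self, pcoll):
                return (
                    pcoll
                    | beam.Filter(lambda word: word)
                    | beam.Map(len))

            with TestPipeline() as p:
              lengths = (
                  p
                  | beam.Create(['', 'a', 'ab', 'abc'])
                  | NonEmptyWordLengths())
              assert_that(lengths, equal_to([1, 2, 3]))
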
  1390  
  1391  class SlowlyChangingSideInputsTest(unittest.TestCase):
  1392    """Tests for PTransform."""
  1393    def test_side_input_slow_update(self):
            # A NamedTemporaryFile is created and closed immediately; only its
            # unique name is kept as the prefix for the files written below.
  1394      temp_file = tempfile.NamedTemporaryFile(delete=True)
  1395      src_file_pattern = temp_file.name
  1396      temp_file.close()
  1397  
  1398      first_ts = math.floor(time.time()) - 30
  1399      interval = 5
  1400      main_input_windowing_interval = 7
  1401  
  1402      # Align the first timestamp so that the results are deterministic.
  1403      first_ts = first_ts - (
  1404          first_ts % (interval * main_input_windowing_interval))
  1405      last_ts = first_ts + 45
  1406  
  1407      for i in range(-1, 10, 1):
  1408        count = i + 2
  1409        idstr = str(first_ts + interval * i)
  1410        with open(src_file_pattern + idstr, "w") as f:
  1411          for j in range(count):
  1412            f.write('f' + idstr + 'a' + str(j) + '\n')
  1413  
  1414      sample_main_input_elements = ([first_ts - 2, # no output: no side input yet
  1415                                     first_ts + 1,  # First window
  1416                                     first_ts + 8,  # Second window
  1417                                     first_ts + 15,  # Third window
  1418                                     first_ts + 22,  # Fourth window
  1419                                     ])
  1420  
  1421      pipeline, pipeline_result = snippets.side_input_slow_update(
  1422        src_file_pattern, first_ts, last_ts, interval,
  1423        sample_main_input_elements, main_input_windowing_interval)
  1424  
  1425      try:
  1426        with pipeline:
  1427          pipeline_result = (
  1428              pipeline_result
  1429              | 'AddKey' >> beam.Map(lambda v: ('key', v))
  1430              | combiners.Count.PerKey())
  1431  
  1432          assert_that(
  1433              pipeline_result,
  1434              equal_to([('key', 3), ('key', 4), ('key', 6), ('key', 7)]))
  1435      finally:
  1436        for i in range(-1, 10, 1):
  1437          os.unlink(src_file_pattern + str(first_ts + interval * i))
  1438  
  1439  
  1440  if __name__ == '__main__':
  1441    logging.getLogger().setLevel(logging.INFO)
  1442    unittest.main()