github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/utils_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #  http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  import importlib
    19  import json
    20  import logging
    21  import tempfile
    22  import unittest
    23  from typing import NamedTuple
    24  from unittest.mock import PropertyMock
    25  from unittest.mock import patch
    26  
    27  import numpy as np
    28  import pandas as pd
    29  import pytest
    30  
    31  import apache_beam as beam
    32  from apache_beam import coders
    33  from apache_beam.dataframe.convert import to_dataframe
    34  from apache_beam.portability.api import beam_runner_api_pb2
    35  from apache_beam.runners.interactive import interactive_beam as ib
    36  from apache_beam.runners.interactive import interactive_environment as ie
    37  from apache_beam.runners.interactive import utils
    38  from apache_beam.runners.interactive.caching.cacheable import Cacheable
    39  from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
    40  from apache_beam.runners.interactive.testing.mock_ipython import mock_get_ipython
    41  from apache_beam.runners.interactive.testing.test_cache_manager import InMemoryCache
    42  from apache_beam.runners.portability.flink_runner import FlinkRunner
    43  from apache_beam.testing.test_stream import WindowedValueHolder
    44  from apache_beam.utils.timestamp import Timestamp
    45  from apache_beam.utils.windowed_value import WindowedValue
    46  
    47  # Protect against environments where apitools library is not available.
    48  try:
    49    from apitools.base.py.exceptions import HttpError
    50    from apitools.base.py.exceptions import HttpNotFoundError
    51  except ImportError:
    52    _http_error_imported = False
    53    HttpError = ValueError
    54    HttpNotFoundError = ValueError
    55  else:
    56    _http_error_imported = True
    57  
    58  
    59  class MockBuckets():
    60    def Get(self, path):
    61      if path == 'test-bucket-not-found':
    62        raise HttpNotFoundError({'status': 404}, {}, '')
    63      elif path == 'test-bucket-not-verified':
    64        raise HttpError({'status': 400}, {}, '')
    65  
    66  
    67  class MockStorageClient():
    68    def __init__(self, buckets=MockBuckets()):
    69      self.buckets = buckets
    70  
    71  
    72  class Record(NamedTuple):
    73    order_id: int
    74    product_id: int
    75    quantity: int
    76  
    77  
    78  def windowed_value(e):
    79    from apache_beam.transforms.window import GlobalWindow
    80    return WindowedValue(e, 1, [GlobalWindow()])
    81  
    82  
    83  class ParseToDataframeTest(unittest.TestCase):
    84    def test_parse_windowedvalue(self):
    85      """Tests that WindowedValues are supported but not present.
    86      """
    87  
    88      els = [windowed_value(('a', 2)), windowed_value(('b', 3))]
    89  
    90      actual_df = utils.elements_to_df(els, include_window_info=False)
    91      expected_df = pd.DataFrame([['a', 2], ['b', 3]], columns=[0, 1])
    92      # check_like so that ordering of indices doesn't matter.
    93      pd.testing.assert_frame_equal(actual_df, expected_df, check_like=True)
    94  
    95    def test_parse_windowedvalue_with_window_info(self):
    96      """Tests that WindowedValues are supported and have their own columns.
    97      """
    98  
    99      els = [windowed_value(('a', 2)), windowed_value(('b', 3))]
   100  
   101      actual_df = utils.elements_to_df(els, include_window_info=True)
   102      expected_df = pd.DataFrame(
   103          [['a', 2, int(1e6), els[0].windows, els[0].pane_info],
   104           ['b', 3, int(1e6), els[1].windows, els[1].pane_info]],
   105          columns=[0, 1, 'event_time', 'windows', 'pane_info'])
   106      # check_like so that ordering of indices doesn't matter.
   107      pd.testing.assert_frame_equal(actual_df, expected_df, check_like=True)
   108  
   109    def test_parse_windowedvalue_with_dicts(self):
   110      """Tests that dicts play well with WindowedValues.
   111      """
   112      els = [
   113          windowed_value({
   114              'b': 2, 'd': 4
   115          }),
   116          windowed_value({
   117              'a': 1, 'b': 2, 'c': 3
   118          })
   119      ]
   120  
   121      actual_df = utils.elements_to_df(els, include_window_info=True)
   122      expected_df = pd.DataFrame(
   123          [[np.nan, 2, np.nan, 4, int(1e6), els[0].windows, els[0].pane_info],
   124           [1, 2, 3, np.nan, int(1e6), els[1].windows, els[1].pane_info]],
   125          columns=['a', 'b', 'c', 'd', 'event_time', 'windows', 'pane_info'])
   126      # check_like so that ordering of indices doesn't matter.
   127      pd.testing.assert_frame_equal(actual_df, expected_df, check_like=True)
   128  
   129    def test_parse_dataframes(self):
   130      """Tests that it correctly parses a DataFrame.
   131      """
   132      deferred = to_dataframe(beam.Pipeline() | beam.Create([Record(0, 0, 0)]))
   133  
   134      els = [windowed_value(pd.DataFrame(Record(n, 0, 0))) for n in range(10)]
   135  
   136      actual_df = utils.elements_to_df(
   137          els, element_type=deferred._expr.proxy()).reset_index(drop=True)
   138      expected_df = pd.concat([e.value for e in els], ignore_index=True)
   139      pd.testing.assert_frame_equal(actual_df, expected_df)
   140  
   141    def test_parse_series(self):
   142      """Tests that it correctly parses a Pandas Series.
   143      """
   144      deferred = to_dataframe(beam.Pipeline()
   145                              | beam.Create([Record(0, 0, 0)]))['order_id']
   146  
   147      els = [windowed_value(pd.Series([n])) for n in range(10)]
   148  
   149      actual_df = utils.elements_to_df(
   150          els, element_type=deferred._expr.proxy()).reset_index(drop=True)
   151      expected_df = pd.concat([e.value for e in els], ignore_index=True)
   152      pd.testing.assert_series_equal(actual_df, expected_df)
   153  
   154  
   155  class ToElementListTest(unittest.TestCase):
   156    def test_test_stream_payload_events(self):
   157      """Tests that the to_element_list can limit the count in a single bundle."""
   158  
   159      coder = coders.FastPrimitivesCoder()
   160  
   161      def reader():
   162        element_payload = [
   163            beam_runner_api_pb2.TestStreamPayload.TimestampedElement(
   164                encoded_element=coder.encode(
   165                    WindowedValueHolder(WindowedValue(e, 0, []))),
   166                timestamp=Timestamp.of(0).micros) for e in range(10)
   167        ]
   168  
   169        event = beam_runner_api_pb2.TestStreamPayload.Event(
   170            element_event=beam_runner_api_pb2.TestStreamPayload.Event.AddElements(
   171                elements=element_payload))
   172        yield event
   173  
   174      # The reader creates 10 elements in a single TestStreamPayload but we limit
   175      # the number of elements read to 5 here. This tests that the to_element_list
   176      # can limit the number of elements in a single bundle.
   177      elements = utils.to_element_list(
   178          reader(), coder, include_window_info=False, n=5)
   179      self.assertSequenceEqual(list(elements), list(range(5)))
   180  
   181    def test_element_limit_count(self):
   182      """Tests that the to_element_list can limit the count."""
   183  
   184      elements = utils.to_element_list(
   185          iter(range(10)), None, include_window_info=False, n=5)
   186      self.assertSequenceEqual(list(elements), list(range(5)))
   187  
   188  
   189  @unittest.skipIf(
   190      not ie.current_env().is_interactive_ready,
   191      '[interactive] dependency is not installed.')
   192  class IPythonLogHandlerTest(unittest.TestCase):
   193    def setUp(self):
   194      utils.register_ipython_log_handler()
   195      self._interactive_root_logger = logging.getLogger(
   196          'apache_beam.runners.interactive')
   197  
   198    def test_ipython_log_handler_not_double_registered(self):
   199      utils.register_ipython_log_handler()
   200      ipython_log_handlers = list(
   201          filter(
   202              lambda x: isinstance(x, utils.IPythonLogHandler),
   203              [handler for handler in self._interactive_root_logger.handlers]))
   204      self.assertEqual(1, len(ipython_log_handlers))
   205  
   206    @patch('apache_beam.runners.interactive.utils.IPythonLogHandler.emit')
   207    def test_default_logging_level_is_info(self, mock_emit):
   208      # By default the logging level of loggers and log handlers are NOTSET. Also,
   209      # the propagation is default to true for all loggers. In this scenario, all
   210      # loggings from child loggers will be propagated to the interactive "root"
   211      # logger which is set to INFO level that gets handled by the sole log
   212      # handler IPythonLogHandler which is set to NOTSET. The effect will be
   213      # everything >= info level will be logged through IPython.display to
   214      # all frontends connected to current kernel.
   215      dummy_logger = logging.getLogger('apache_beam.runners.interactive.dummy1')
   216      dummy_logger.info('info')
   217      mock_emit.assert_called_once()
   218      dummy_logger.debug('debug')
   219      # Emit is not called, so it's still called once.
   220      mock_emit.assert_called_once()
   221  
   222    @patch('apache_beam.runners.interactive.utils.IPythonLogHandler.emit')
   223    def test_child_module_logger_can_override_logging_level(self, mock_emit):
   224      # When a child logger's logging level is configured to something that is not
   225      # NOTSET, it takes back the logging control from the interactive "root"
   226      # logger by not propagating anything.
   227      dummy_logger = logging.getLogger('apache_beam.runners.interactive.dummy2')
   228      dummy_logger.setLevel(logging.DEBUG)
   229      mock_emit.assert_not_called()
   230      dummy_logger.debug('debug')
   231      # Because the dummy child logger is configured to log at DEBUG level, it
   232      # now propagates DEBUG loggings to the interactive "root" logger.
   233      mock_emit.assert_called_once()
   234      # When the dummy child logger is configured to log at CRITICAL level, it
   235      # will only propagate CRITICAL loggings to the interactive "root" logger.
   236      dummy_logger.setLevel(logging.CRITICAL)
   237      # Error loggings will not be handled now.
   238      dummy_logger.error('error')
   239      # Emit is not called, so it's still called once.
   240      mock_emit.assert_called_once()
   241  
   242  
   243  @unittest.skipIf(
   244      not ie.current_env().is_interactive_ready,
   245      '[interactive] dependency is not installed.')
   246  @pytest.mark.skipif(
   247      not ie.current_env().is_interactive_ready,
   248      reason='[interactive] dependency is not installed.')
   249  class ProgressIndicatorTest(unittest.TestCase):
   250    def setUp(self):
   251      ie.new_env()
   252  
   253    @patch('IPython.get_ipython', new_callable=mock_get_ipython)
   254    @patch(
   255        'apache_beam.runners.interactive.interactive_environment'
   256        '.InteractiveEnvironment.is_in_notebook',
   257        new_callable=PropertyMock)
   258    def test_progress_in_plain_text_when_not_in_notebook(
   259        self, mocked_is_in_notebook, unused):
   260      mocked_is_in_notebook.return_value = False
   261  
   262      with patch('IPython.display.display') as mocked_display:
   263  
   264        @utils.progress_indicated
   265        def progress_indicated_dummy():
   266          mocked_display.assert_any_call('Processing... progress_indicated_dummy')
   267  
   268        progress_indicated_dummy()
   269        mocked_display.assert_any_call('Done.')
   270  
   271    @patch('IPython.get_ipython', new_callable=mock_get_ipython)
   272    @patch(
   273        'apache_beam.runners.interactive.interactive_environment'
   274        '.InteractiveEnvironment.is_in_notebook',
   275        new_callable=PropertyMock)
   276    def test_progress_in_HTML_JS_when_in_notebook(
   277        self, mocked_is_in_notebook, unused):
   278      mocked_is_in_notebook.return_value = True
   279  
   280      with patch('IPython.display.HTML') as mocked_html,\
   281        patch('IPython.display.Javascript') as mocked_js:
   282        with utils.ProgressIndicator('enter', 'exit'):
   283          mocked_html.assert_called()
   284        mocked_js.assert_called()
   285  
   286  
   287  @unittest.skipIf(
   288      not ie.current_env().is_interactive_ready,
   289      '[interactive] dependency is not installed.')
   290  class MessagingUtilTest(unittest.TestCase):
   291    SAMPLE_DATA = {'a': [1, 2, 3], 'b': 4, 'c': '5', 'd': {'e': 'f'}}
   292  
   293    def setUp(self):
   294      ie.new_env()
   295  
   296    def test_as_json_decorator(self):
   297      @utils.as_json
   298      def dummy():
   299        return MessagingUtilTest.SAMPLE_DATA
   300  
   301      # As of Python 3.6, for the CPython implementation of Python,
   302      # dictionaries remember the order of items inserted.
   303      self.assertEqual(json.loads(dummy()), MessagingUtilTest.SAMPLE_DATA)
   304  
   305  
   306  class GeneralUtilTest(unittest.TestCase):
   307    def test_pcoll_by_name(self):
   308      p = beam.Pipeline()
   309      pcoll = p | beam.Create([1])
   310      ib.watch({'p': p, 'pcoll': pcoll})
   311  
   312      name_to_pcoll = utils.pcoll_by_name()
   313      self.assertIn('pcoll', name_to_pcoll)
   314  
   315    def test_cacheables(self):
   316      p2 = beam.Pipeline()
   317      pcoll2 = p2 | beam.Create([2])
   318      ib.watch({'p2': p2, 'pcoll2': pcoll2})
   319  
   320      cacheables = utils.cacheables()
   321      cacheable_key = Cacheable.from_pcoll('pcoll2', pcoll2).to_key()
   322      self.assertIn(cacheable_key, cacheables)
   323  
   324    def test_has_unbounded_source(self):
   325      p = beam.Pipeline()
   326      ie.current_env().set_cache_manager(InMemoryCache(), p)
   327      _ = p | 'ReadUnboundedSource' >> beam.io.ReadFromPubSub(
   328          subscription='projects/fake-project/subscriptions/fake_sub')
   329      self.assertTrue(utils.has_unbounded_sources(p))
   330  
   331    def test_not_has_unbounded_source(self):
   332      p = beam.Pipeline()
   333      ie.current_env().set_cache_manager(InMemoryCache(), p)
   334      with tempfile.NamedTemporaryFile(delete=False) as f:
   335        f.write(b'test')
   336      _ = p | 'ReadBoundedSource' >> beam.io.ReadFromText(f.name)
   337      self.assertFalse(utils.has_unbounded_sources(p))
   338  
   339    def test_find_pcoll_name(self):
   340      p = beam.Pipeline()
   341      pcoll = p | beam.Create([1, 2, 3])
   342      ib.watch({
   343          'p_test_find_pcoll_name': p,
   344          'pcoll_test_find_pcoll_name': pcoll,
   345      })
   346      self.assertEqual('pcoll_test_find_pcoll_name', utils.find_pcoll_name(pcoll))
   347  
   348    def test_create_var_in_main(self):
   349      name = 'test_create_var_in_main'
   350      value = Record(0, 0, 0)
   351      _ = utils.create_var_in_main(name, value)
   352      main_session = importlib.import_module('__main__')
   353      self.assertIs(getattr(main_session, name, None), value)
   354  
   355  
   356  @patch(
   357      'apache_beam.io.gcp.internal.clients.storage.StorageV1',
   358      return_value=MockStorageClient())
   359  @unittest.skipIf(not _http_error_imported, 'http errors are not imported.')
   360  class GCSUtilsTest(unittest.TestCase):
   361    @patch(
   362        'apache_beam.io.gcp.internal.clients.storage.StorageBucketsGetRequest',
   363        return_value='test-bucket-not-found')
   364    def test_assert_bucket_exists_not_found(self, mock_response, mock_client):
   365      with self.assertRaises(ValueError):
   366        utils.assert_bucket_exists('')
   367  
   368    @patch(
   369        'apache_beam.io.gcp.internal.clients.storage.StorageBucketsGetRequest',
   370        return_value='test-bucket-not-verified')
   371    def test_assert_bucket_exists_not_verified(self, mock_response, mock_client):
   372      from apache_beam.runners.interactive.utils import _LOGGER
   373      with self.assertLogs(_LOGGER, level='WARNING'):
   374        utils.assert_bucket_exists('')
   375  
   376    @patch(
   377        'apache_beam.io.gcp.internal.clients.storage.StorageBucketsGetRequest',
   378        return_value='test-bucket-found')
   379    def test_assert_bucket_exists_found(self, mock_response, mock_client):
   380      utils.assert_bucket_exists('')
   381  
   382  
   383  class PipelineUtilTest(unittest.TestCase):
   384    def test_detect_pipeline_underlying_runner(self):
   385      p = beam.Pipeline(InteractiveRunner(underlying_runner=FlinkRunner()))
   386      pipeline_runner = utils.detect_pipeline_runner(p)
   387      self.assertTrue(isinstance(pipeline_runner, FlinkRunner))
   388  
   389    def test_detect_pipeline_no_underlying_runner(self):
   390      p = beam.Pipeline(InteractiveRunner())
   391      pipeline_runner = utils.detect_pipeline_runner(p)
   392      from apache_beam.runners.direct.direct_runner import DirectRunner
   393      self.assertTrue(isinstance(pipeline_runner, DirectRunner))
   394  
   395    def test_detect_pipeline_no_runner(self):
   396      pipeline_runner = utils.detect_pipeline_runner(None)
   397      self.assertEqual(pipeline_runner, None)
   398  
   399  
   400  if __name__ == '__main__':
   401    unittest.main()