#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Unit tests for apache_beam.runners.interactive.utils."""

import importlib
import json
import logging
import os
import tempfile
import unittest
from typing import NamedTuple
from unittest.mock import PropertyMock
from unittest.mock import patch

import numpy as np
import pandas as pd
import pytest

import apache_beam as beam
from apache_beam import coders
from apache_beam.dataframe.convert import to_dataframe
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive import utils
from apache_beam.runners.interactive.caching.cacheable import Cacheable
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.runners.interactive.testing.mock_ipython import mock_get_ipython
from apache_beam.runners.interactive.testing.test_cache_manager import InMemoryCache
from apache_beam.runners.portability.flink_runner import FlinkRunner
from apache_beam.testing.test_stream import WindowedValueHolder
from apache_beam.utils.timestamp import Timestamp
from apache_beam.utils.windowed_value import WindowedValue

# Protect against environments where apitools library is not available.
try:
  from apitools.base.py.exceptions import HttpError
  from apitools.base.py.exceptions import HttpNotFoundError
except ImportError:
  _http_error_imported = False
  # Fall back to ValueError so the mocks below can still be defined; tests
  # that need the real exceptions are skipped via _http_error_imported.
  HttpError = ValueError
  HttpNotFoundError = ValueError
else:
  _http_error_imported = True


class MockBuckets():
  """Mock of a storage client's buckets service used by GCS util tests."""
  def Get(self, path):
    """Raises an HTTP error for the two sentinel paths; otherwise no-op."""
    if path == 'test-bucket-not-found':
      raise HttpNotFoundError({'status': 404}, {}, '')
    elif path == 'test-bucket-not-verified':
      raise HttpError({'status': 400}, {}, '')


class MockStorageClient():
  """Mock storage client exposing only a `buckets` attribute."""
  def __init__(self, buckets=None):
    # Build the default per instance instead of in the signature, where it
    # would be evaluated once and shared by every MockStorageClient.
    self.buckets = MockBuckets() if buckets is None else buckets


class Record(NamedTuple):
  """Simple schema'd element type used to build DataFrame test inputs."""
  order_id: int
  product_id: int
  quantity: int


def windowed_value(e):
  """Wraps `e` in a WindowedValue with timestamp 1 in the GlobalWindow."""
  from apache_beam.transforms.window import GlobalWindow
  return WindowedValue(e, 1, [GlobalWindow()])


class ParseToDataframeTest(unittest.TestCase):
  """Tests for utils.elements_to_df."""
  def test_parse_windowedvalue(self):
    """Tests that WindowedValues are supported but not present.
    """

    els = [windowed_value(('a', 2)), windowed_value(('b', 3))]

    actual_df = utils.elements_to_df(els, include_window_info=False)
    expected_df = pd.DataFrame([['a', 2], ['b', 3]], columns=[0, 1])
    # check_like so that ordering of indices doesn't matter.
    pd.testing.assert_frame_equal(actual_df, expected_df, check_like=True)

  def test_parse_windowedvalue_with_window_info(self):
    """Tests that WindowedValues are supported and have their own columns.
    """

    els = [windowed_value(('a', 2)), windowed_value(('b', 3))]

    actual_df = utils.elements_to_df(els, include_window_info=True)
    expected_df = pd.DataFrame(
        [['a', 2, int(1e6), els[0].windows, els[0].pane_info],
         ['b', 3, int(1e6), els[1].windows, els[1].pane_info]],
        columns=[0, 1, 'event_time', 'windows', 'pane_info'])
    # check_like so that ordering of indices doesn't matter.
    pd.testing.assert_frame_equal(actual_df, expected_df, check_like=True)

  def test_parse_windowedvalue_with_dicts(self):
    """Tests that dicts play well with WindowedValues.
    """
    els = [
        windowed_value({
            'b': 2, 'd': 4
        }),
        windowed_value({
            'a': 1, 'b': 2, 'c': 3
        })
    ]

    actual_df = utils.elements_to_df(els, include_window_info=True)
    expected_df = pd.DataFrame(
        [[np.nan, 2, np.nan, 4, int(1e6), els[0].windows, els[0].pane_info],
         [1, 2, 3, np.nan, int(1e6), els[1].windows, els[1].pane_info]],
        columns=['a', 'b', 'c', 'd', 'event_time', 'windows', 'pane_info'])
    # check_like so that ordering of indices doesn't matter.
    pd.testing.assert_frame_equal(actual_df, expected_df, check_like=True)

  def test_parse_dataframes(self):
    """Tests that it correctly parses a DataFrame.
    """
    deferred = to_dataframe(beam.Pipeline() | beam.Create([Record(0, 0, 0)]))

    els = [windowed_value(pd.DataFrame(Record(n, 0, 0))) for n in range(10)]

    actual_df = utils.elements_to_df(
        els, element_type=deferred._expr.proxy()).reset_index(drop=True)
    expected_df = pd.concat([e.value for e in els], ignore_index=True)
    pd.testing.assert_frame_equal(actual_df, expected_df)

  def test_parse_series(self):
    """Tests that it correctly parses a Pandas Series.
    """
    deferred = to_dataframe(beam.Pipeline()
                            | beam.Create([Record(0, 0, 0)]))['order_id']

    els = [windowed_value(pd.Series([n])) for n in range(10)]

    actual_df = utils.elements_to_df(
        els, element_type=deferred._expr.proxy()).reset_index(drop=True)
    expected_df = pd.concat([e.value for e in els], ignore_index=True)
    pd.testing.assert_series_equal(actual_df, expected_df)


class ToElementListTest(unittest.TestCase):
  """Tests for utils.to_element_list."""
  def test_test_stream_payload_events(self):
    """Tests that the to_element_list can limit the count in a single bundle."""

    coder = coders.FastPrimitivesCoder()

    def reader():
      element_payload = [
          beam_runner_api_pb2.TestStreamPayload.TimestampedElement(
              encoded_element=coder.encode(
                  WindowedValueHolder(WindowedValue(e, 0, []))),
              timestamp=Timestamp.of(0).micros) for e in range(10)
      ]

      event = beam_runner_api_pb2.TestStreamPayload.Event(
          element_event=beam_runner_api_pb2.TestStreamPayload.Event.AddElements(
              elements=element_payload))
      yield event

    # The reader creates 10 elements in a single TestStreamPayload but we limit
    # the number of elements read to 5 here. This tests that the to_element_list
    # can limit the number of elements in a single bundle.
    elements = utils.to_element_list(
        reader(), coder, include_window_info=False, n=5)
    self.assertSequenceEqual(list(elements), list(range(5)))

  def test_element_limit_count(self):
    """Tests that the to_element_list can limit the count."""

    elements = utils.to_element_list(
        iter(range(10)), None, include_window_info=False, n=5)
    self.assertSequenceEqual(list(elements), list(range(5)))


@unittest.skipIf(
    not ie.current_env().is_interactive_ready,
    '[interactive] dependency is not installed.')
class IPythonLogHandlerTest(unittest.TestCase):
  """Tests for the IPythonLogHandler registration and level handling."""
  def setUp(self):
    utils.register_ipython_log_handler()
    self._interactive_root_logger = logging.getLogger(
        'apache_beam.runners.interactive')

  def test_ipython_log_handler_not_double_registered(self):
    """Registering a second time must not add a second IPythonLogHandler."""
    utils.register_ipython_log_handler()
    ipython_log_handlers = [
        handler for handler in self._interactive_root_logger.handlers
        if isinstance(handler, utils.IPythonLogHandler)
    ]
    self.assertEqual(1, len(ipython_log_handlers))

  @patch('apache_beam.runners.interactive.utils.IPythonLogHandler.emit')
  def test_default_logging_level_is_info(self, mock_emit):
    # By default the logging level of loggers and log handlers are NOTSET. Also,
    # the propagation is default to true for all loggers. In this scenario, all
    # loggings from child loggers will be propagated to the interactive "root"
    # logger which is set to INFO level that gets handled by the sole log
    # handler IPythonLogHandler which is set to NOTSET. The effect will be
    # everything >= info level will be logged through IPython.display to
    # all frontends connected to current kernel.
    dummy_logger = logging.getLogger('apache_beam.runners.interactive.dummy1')
    dummy_logger.info('info')
    mock_emit.assert_called_once()
    dummy_logger.debug('debug')
    # Emit is not called, so it's still called once.
    mock_emit.assert_called_once()

  @patch('apache_beam.runners.interactive.utils.IPythonLogHandler.emit')
  def test_child_module_logger_can_override_logging_level(self, mock_emit):
    # When a child logger's logging level is configured to something that is not
    # NOTSET, it takes back the logging control from the interactive "root"
    # logger by not propagating anything.
    dummy_logger = logging.getLogger('apache_beam.runners.interactive.dummy2')
    dummy_logger.setLevel(logging.DEBUG)
    mock_emit.assert_not_called()
    dummy_logger.debug('debug')
    # Because the dummy child logger is configured to log at DEBUG level, it
    # now propagates DEBUG loggings to the interactive "root" logger.
    mock_emit.assert_called_once()
    # When the dummy child logger is configured to log at CRITICAL level, it
    # will only propagate CRITICAL loggings to the interactive "root" logger.
    dummy_logger.setLevel(logging.CRITICAL)
    # Error loggings will not be handled now.
    dummy_logger.error('error')
    # Emit is not called, so it's still called once.
    mock_emit.assert_called_once()


@unittest.skipIf(
    not ie.current_env().is_interactive_ready,
    '[interactive] dependency is not installed.')
@pytest.mark.skipif(
    not ie.current_env().is_interactive_ready,
    reason='[interactive] dependency is not installed.')
class ProgressIndicatorTest(unittest.TestCase):
  """Tests for utils.progress_indicated and utils.ProgressIndicator."""
  def setUp(self):
    ie.new_env()

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  @patch(
      'apache_beam.runners.interactive.interactive_environment'
      '.InteractiveEnvironment.is_in_notebook',
      new_callable=PropertyMock)
  def test_progress_in_plain_text_when_not_in_notebook(
      self, mocked_is_in_notebook, unused):
    """Outside a notebook, progress is shown via plain display() calls."""
    mocked_is_in_notebook.return_value = False

    with patch('IPython.display.display') as mocked_display:

      @utils.progress_indicated
      def progress_indicated_dummy():
        # Checked from inside the decorated function, i.e. while the
        # indicator is still active.
        mocked_display.assert_any_call(
            'Processing... progress_indicated_dummy')

      progress_indicated_dummy()
      mocked_display.assert_any_call('Done.')

  @patch('IPython.get_ipython', new_callable=mock_get_ipython)
  @patch(
      'apache_beam.runners.interactive.interactive_environment'
      '.InteractiveEnvironment.is_in_notebook',
      new_callable=PropertyMock)
  def test_progress_in_HTML_JS_when_in_notebook(
      self, mocked_is_in_notebook, unused):
    """Inside a notebook, entering the indicator builds HTML and Javascript."""
    mocked_is_in_notebook.return_value = True

    with patch('IPython.display.HTML') as mocked_html,\
      patch('IPython.display.Javascript') as mocked_js:
      with utils.ProgressIndicator('enter', 'exit'):
        mocked_html.assert_called()
        mocked_js.assert_called()


@unittest.skipIf(
    not ie.current_env().is_interactive_ready,
    '[interactive] dependency is not installed.')
class MessagingUtilTest(unittest.TestCase):
  """Tests for the utils.as_json decorator."""
  SAMPLE_DATA = {'a': [1, 2, 3], 'b': 4, 'c': '5', 'd': {'e': 'f'}}

  def setUp(self):
    ie.new_env()

  def test_as_json_decorator(self):
    """The decorated function's return value round-trips through JSON."""
    @utils.as_json
    def dummy():
      return MessagingUtilTest.SAMPLE_DATA

    # As of Python 3.6, for the CPython implementation of Python,
    # dictionaries remember the order of items inserted.
    self.assertEqual(json.loads(dummy()), MessagingUtilTest.SAMPLE_DATA)


class GeneralUtilTest(unittest.TestCase):
  """Tests for miscellaneous helpers in the interactive utils module."""
  def test_pcoll_by_name(self):
    p = beam.Pipeline()
    pcoll = p | beam.Create([1])
    ib.watch({'p': p, 'pcoll': pcoll})

    name_to_pcoll = utils.pcoll_by_name()
    self.assertIn('pcoll', name_to_pcoll)

  def test_cacheables(self):
    p2 = beam.Pipeline()
    pcoll2 = p2 | beam.Create([2])
    ib.watch({'p2': p2, 'pcoll2': pcoll2})

    cacheables = utils.cacheables()
    cacheable_key = Cacheable.from_pcoll('pcoll2', pcoll2).to_key()
    self.assertIn(cacheable_key, cacheables)

  def test_has_unbounded_source(self):
    p = beam.Pipeline()
    ie.current_env().set_cache_manager(InMemoryCache(), p)
    _ = p | 'ReadUnboundedSource' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    self.assertTrue(utils.has_unbounded_sources(p))

  def test_not_has_unbounded_source(self):
    p = beam.Pipeline()
    ie.current_env().set_cache_manager(InMemoryCache(), p)
    with tempfile.NamedTemporaryFile(delete=False) as f:
      # delete=False keeps the file on disk after the `with` block so it can
      # be read below; remove it explicitly once the test has finished.
      self.addCleanup(os.remove, f.name)
      f.write(b'test')
    _ = p | 'ReadBoundedSource' >> beam.io.ReadFromText(f.name)
    self.assertFalse(utils.has_unbounded_sources(p))

  def test_find_pcoll_name(self):
    p = beam.Pipeline()
    pcoll = p | beam.Create([1, 2, 3])
    ib.watch({
        'p_test_find_pcoll_name': p,
        'pcoll_test_find_pcoll_name': pcoll,
    })
    self.assertEqual('pcoll_test_find_pcoll_name', utils.find_pcoll_name(pcoll))

  def test_create_var_in_main(self):
    name = 'test_create_var_in_main'
    value = Record(0, 0, 0)
    _ = utils.create_var_in_main(name, value)
    main_session = importlib.import_module('__main__')
    self.assertIs(getattr(main_session, name, None), value)


@patch(
    'apache_beam.io.gcp.internal.clients.storage.StorageV1',
    return_value=MockStorageClient())
@unittest.skipIf(not _http_error_imported, 'http errors are not imported.')
class GCSUtilsTest(unittest.TestCase):
  """Tests for utils.assert_bucket_exists against a mocked storage client.

  Each test patches StorageBucketsGetRequest to return a sentinel string that
  MockBuckets.Get maps to a specific outcome.
  """
  @patch(
      'apache_beam.io.gcp.internal.clients.storage.StorageBucketsGetRequest',
      return_value='test-bucket-not-found')
  def test_assert_bucket_exists_not_found(self, mock_response, mock_client):
    """A not-found response surfaces as a ValueError."""
    with self.assertRaises(ValueError):
      utils.assert_bucket_exists('')

  @patch(
      'apache_beam.io.gcp.internal.clients.storage.StorageBucketsGetRequest',
      return_value='test-bucket-not-verified')
  def test_assert_bucket_exists_not_verified(self, mock_response, mock_client):
    """A non-404 HTTP error is logged at WARNING level or above."""
    from apache_beam.runners.interactive.utils import _LOGGER
    with self.assertLogs(_LOGGER, level='WARNING'):
      utils.assert_bucket_exists('')

  @patch(
      'apache_beam.io.gcp.internal.clients.storage.StorageBucketsGetRequest',
      return_value='test-bucket-found')
  def test_assert_bucket_exists_found(self, mock_response, mock_client):
    """No error is raised when the mocked Get call succeeds."""
    utils.assert_bucket_exists('')


class PipelineUtilTest(unittest.TestCase):
  """Tests for utils.detect_pipeline_runner."""
  def test_detect_pipeline_underlying_runner(self):
    p = beam.Pipeline(InteractiveRunner(underlying_runner=FlinkRunner()))
    pipeline_runner = utils.detect_pipeline_runner(p)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(pipeline_runner, FlinkRunner)

  def test_detect_pipeline_no_underlying_runner(self):
    p = beam.Pipeline(InteractiveRunner())
    pipeline_runner = utils.detect_pipeline_runner(p)
    from apache_beam.runners.direct.direct_runner import DirectRunner
    self.assertIsInstance(pipeline_runner, DirectRunner)

  def test_detect_pipeline_no_runner(self):
    pipeline_runner = utils.detect_pipeline_runner(None)
    self.assertIsNone(pipeline_runner)


if __name__ == '__main__':
  unittest.main()