github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/interactive_runner_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """Tests for google3.pipeline.dataflow.python.interactive.interactive_runner.
    19  
    20  This module is experimental. No backwards-compatibility guarantees.
    21  """

# pytype: skip-file

import sys
import unittest
from typing import NamedTuple

import pandas as pd

import apache_beam as beam
from apache_beam.dataframe.convert import to_dataframe
from apache_beam.options.pipeline_options import FlinkRunnerOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import WorkerOptions
from apache_beam.runners.direct import direct_runner
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive import interactive_runner
from apache_beam.runners.interactive.dataproc.dataproc_cluster_manager import DataprocClusterManager
from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
from apache_beam.runners.interactive.testing.mock_env import isolated_env
from apache_beam.runners.portability.flink_runner import FlinkRunner
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import IntervalWindow
from apache_beam.utils.timestamp import Timestamp
from apache_beam.utils.windowed_value import PaneInfo
from apache_beam.utils.windowed_value import PaneInfoTiming
from apache_beam.utils.windowed_value import WindowedValue


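# Identity helper: returns a printer suitable for beam.Map that logs each
# element with a message prefix, so the tests below can observe which
# transforms re-execute on each incremental p.run().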
def print_with_message(msg):
  def printer(elem):
    print(msg, elem)
    return elem

  return printer


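# Beam infers a schema from this typed NamedTuple, so PCollections of Record
# elements can be converted with to_dataframe() in the DataFrame tests below.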
class Record(NamedTuple):
  name: str
  age: int
  height: int


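# The isolated_env decorator (from testing.mock_env) runs each test against a
# fresh, mocked interactive environment; it exposes that environment as
# self.current_env and a fake notebook cell context manager as self.cell.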
@isolated_env
class InteractiveRunnerTest(unittest.TestCase):
  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_basic(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
    ib.watch({'p': p})
    p.run().wait_until_finish()
    pc0 = (
        p | 'read' >> beam.Create([1, 2, 3])
        | 'Print1.1' >> beam.Map(print_with_message('Run1.1')))
    pc = pc0 | 'Print1.2' >> beam.Map(print_with_message('Run1.2'))
    ib.watch(locals())
    p.run().wait_until_finish()
    _ = pc | 'Print2' >> beam.Map(print_with_message('Run2'))
    p.run().wait_until_finish()
    _ = pc0 | 'Print3' >> beam.Map(print_with_message('Run3'))
    p.run().wait_until_finish()

  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_wordcount(self):
    class WordExtractingDoFn(beam.DoFn):
      def process(self, element):
        text_line = element.strip()
        words = text_line.split()
        return words

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be cached.
    ib.watch(locals())

    # This is normally done in interactive_utils when a transform is applied,
    # but that requires an IPython environment, so we invoke it manually here.
    ie.current_env().track_user_pipelines()

    result = p.run()
    result.wait_until_finish()

    actual = list(result.get(counts))
    self.assertSetEqual(
        set(actual),
        set([
            ('or', 1),
            ('that', 1),
            ('be', 2),
            ('is', 1),
            ('question', 1),
            ('to', 2),
            ('the', 1),
            ('not', 1),
        ]))

    # Truncate the precision to millis because the window coder uses millis
    # as units then gets upcast to micros.
    end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
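    # With include_window_info=True, ib.collect appends event_time, windows,
    # and pane_info columns after the element columns (here 0 and 1, since
    # the elements are unnamed 2-tuples).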
    df_counts = ib.collect(counts, include_window_info=True, n=10)
    df_expected = pd.DataFrame(
        {
            0: [e[0] for e in actual],
            1: [e[1] for e in actual],
            'event_time': [end_of_window for _ in actual],
            'windows': [[GlobalWindow()] for _ in actual],
            'pane_info': [
                PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                for _ in actual
            ]
        },
        columns=[0, 1, 'event_time', 'windows', 'pane_info'])

    pd.testing.assert_frame_equal(df_expected, df_counts)

    actual_reified = result.get(counts, include_window_info=True)
    expected_reified = [
        WindowedValue(
            e,
            Timestamp(micros=end_of_window), [GlobalWindow()],
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)) for e in actual
    ]
    self.assertEqual(actual_reified, expected_reified)

  def test_streaming_wordcount(self):
    class WordExtractingDoFn(beam.DoFn):
      def process(self, element):
        text_line = element.strip()
        words = text_line.split()
        return words

    # Add the TestStream so that it can be cached.
    ib.options.recordable_sources.add(TestStream)

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(),
        options=StandardOptions(streaming=True))

    data = (
        p
        | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
            .advance_watermark_to(20)
            .advance_processing_time(1)
            .add_elements(['that', 'is', 'the', 'question'])
            .advance_watermark_to(30)
            .advance_processing_time(1)
            .advance_watermark_to(40)
            .advance_processing_time(1)
            .advance_watermark_to(50)
            .advance_processing_time(1)
        | beam.WindowInto(beam.window.FixedWindows(10)))  # yapf: disable

    counts = (
        data
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that referenced
    # PCollections will be cached.
    ib.watch(locals())

    # This is normally done in interactive_utils when a transform is applied,
    # but that requires an IPython environment, so we invoke it manually here.
    ie.current_env().track_user_pipelines()

    # This tests that the data was correctly cached.
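    # Event times below are in microseconds: a watermark at 20 s is 20000000.
    # FixedWindows(10) places elements into [0, 10) and [20, 30) accordingly.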
    pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
    expected_data_df = pd.DataFrame([
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('or', 0, [IntervalWindow(0, 10)], pane_info),
        ('not', 0, [IntervalWindow(0, 10)], pane_info),
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
    ], columns=[0, 'event_time', 'windows', 'pane_info'])  # yapf: disable

    data_df = ib.collect(data, n=10, include_window_info=True)
    pd.testing.assert_frame_equal(expected_data_df, data_df)

    # This tests that the windowing was passed through correctly so that the
    # data is also aggregated correctly.
    pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
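    # 9999999 and 29999999 are IntervalWindow(0, 10).max_timestamp() and
    # IntervalWindow(20, 30).max_timestamp() in micros: one microsecond
    # before each window's end.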
    expected_counts_df = pd.DataFrame([
        ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
    ], columns=[0, 1, 'event_time', 'windows', 'pane_info'])  # yapf: disable

    counts_df = ib.collect(counts, n=8, include_window_info=True)

    # GroupByKey makes no ordering guarantee, so we sort the resulting
    # DataFrame before testing equality.
    sorted_counts_df = (counts_df
                        .sort_values(['event_time', 0], ascending=True)
                        .reset_index(drop=True))  # yapf: disable
    pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)

  def test_session(self):
    class MockPipelineRunner(object):
      def __init__(self):
        self._in_session = False

      def __enter__(self):
        self._in_session = True

      def __exit__(self, exc_type, exc_val, exc_tb):
        self._in_session = False

    underlying_runner = MockPipelineRunner()
    runner = interactive_runner.InteractiveRunner(underlying_runner)
    runner.start_session()
    self.assertTrue(underlying_runner._in_session)
    runner.end_session()
    self.assertFalse(underlying_runner._in_session)

  @unittest.skipIf(
      not ie.current_env().is_interactive_ready,
      '[interactive] dependency is not installed.')
  def test_mark_pcollection_completed_after_successful_run(self):
    with self.cell:  # Cell 1
      p = beam.Pipeline(interactive_runner.InteractiveRunner())
      ib.watch({'p': p})

    with self.cell:  # Cell 2
      # pylint: disable=bad-option-value
      init = p | 'Init' >> beam.Create(range(5))

    with self.cell:  # Cell 3
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x**3)

    ib.watch(locals())
    result = p.run()
    self.assertTrue(init in ie.current_env().computed_pcollections)
    self.assertEqual({0, 1, 2, 3, 4}, set(result.get(init)))
    self.assertTrue(square in ie.current_env().computed_pcollections)
    self.assertEqual({0, 1, 4, 9, 16}, set(result.get(square)))
    self.assertTrue(cube in ie.current_env().computed_pcollections)
    self.assertEqual({0, 1, 8, 27, 64}, set(result.get(cube)))

  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_dataframes(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
    data = p | beam.Create(
        [1, 2, 3]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
    df = to_dataframe(data)

    # Watch the local scope for Interactive Beam so that values will be cached.
    ib.watch(locals())

    # This is normally done in interactive_utils when a transform is applied,
    # but that requires an IPython environment, so we invoke it manually here.
    ie.current_env().track_user_pipelines()

    df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
    pd.testing.assert_frame_equal(
        df_expected, ib.collect(df, n=10).reset_index(drop=True))

  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_dataframes_with_grouped_index(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    data = [
        Record('a', 20, 170),
        Record('a', 30, 170),
        Record('b', 22, 180),
        Record('c', 18, 150)
    ]

    aggregate = lambda df: df.groupby('height').mean()

    deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
    df_expected = aggregate(pd.DataFrame(data))

    # Watch the local scope for Interactive Beam so that values will be cached.
    ib.watch(locals())

    # This is normally done in interactive_utils when a transform is applied,
    # but that requires an IPython environment, so we invoke it manually here.
    ie.current_env().track_user_pipelines()

    pd.testing.assert_frame_equal(df_expected, ib.collect(deferred_df, n=10))

  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_dataframes_with_multi_index(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    data = [
        Record('a', 20, 170),
        Record('a', 30, 170),
        Record('b', 22, 180),
        Record('c', 18, 150)
    ]

    aggregate = lambda df: df.groupby(['name', 'height']).mean()

    deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
    df_input = pd.DataFrame(data)
    df_input.name = df_input.name.astype(pd.StringDtype())
    df_expected = aggregate(df_input)

    # Watch the local scope for Interactive Beam so that values will be cached.
    ib.watch(locals())

    # This is normally done in interactive_utils when a transform is applied,
    # but that requires an IPython environment, so we invoke it manually here.
    ie.current_env().track_user_pipelines()

    pd.testing.assert_frame_equal(df_expected, ib.collect(deferred_df, n=10))

  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_dataframes_with_multi_index_get_result(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    data = [
        Record('a', 20, 170),
        Record('a', 30, 170),
        Record('b', 22, 180),
        Record('c', 18, 150)
    ]

    aggregate = lambda df: df.groupby(['name', 'height']).mean()['age']

    deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
    df_input = pd.DataFrame(data)
    df_input.name = df_input.name.astype(pd.StringDtype())
    df_expected = aggregate(df_input)

    # Watch the local scope for Interactive Beam so that values will be cached.
    ib.watch(locals())

    # This is normally done in interactive_utils when a transform is applied,
    # but that requires an IPython environment, so we invoke it manually here.
    ie.current_env().track_user_pipelines()

    pd.testing.assert_series_equal(df_expected, ib.collect(deferred_df, n=10))

  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_dataframes_same_cell_twice(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
    data = p | beam.Create(
        [1, 2, 3]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
    df = to_dataframe(data)

    # Watch the local scope for Interactive Beam so that values will be cached.
    ib.watch(locals())

    # This is normally done in interactive_utils when a transform is applied,
    # but that requires an IPython environment, so we invoke it manually here.
    ie.current_env().track_user_pipelines()

    df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
    pd.testing.assert_series_equal(
        df_expected['square'],
        ib.collect(df['square'], n=10).reset_index(drop=True))
    pd.testing.assert_series_equal(
        df_expected['cube'],
        ib.collect(df['cube'], n=10).reset_index(drop=True))

  @unittest.skipIf(
      not ie.current_env().is_interactive_ready,
      '[interactive] dependency is not installed.')
  @unittest.skipIf(sys.platform == "win32", "[BEAM-10627]")
  def test_dataframe_caching(self):
    # Create a pipeline that exercises the DataFrame API. This will also use
    # caching in the background.
    with self.cell:  # Cell 1
      p = beam.Pipeline(interactive_runner.InteractiveRunner())
      ib.watch({'p': p})

    with self.cell:  # Cell 2
      data = p | beam.Create([
          1, 2, 3
      ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))

      with beam.dataframe.allow_non_parallel_operations():
        df = to_dataframe(data).reset_index(drop=True)

      ib.collect(df)

    with self.cell:  # Cell 3
      df['output'] = df['square'] * df['cube']
      ib.collect(df)

    with self.cell:  # Cell 4
      df['output'] = 0
      ib.collect(df)

    # We use a trace through the graph to perform an isomorphism test. The end
    # output should look like a linear graph. This indicates that the dataframe
    # transform was correctly broken into separate pieces to cache. If caching
    # isn't enabled, all the dataframe computation nodes are connected to a
    # single shared node.
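    # For example, a trace of [('', 'A'), ('A', 'B'), ('B', 'C')] is linear:
    # each node's producer is the previous node. A fan-out such as
    # [('', 'A'), ('A', 'B'), ('A', 'C')] would fail the check below.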
    trace = []

    # Only look at the top-level transforms for the isomorphism. The test
    # doesn't care about the transform implementations, just the overall shape.
    class TopLevelTracer(beam.pipeline.PipelineVisitor):
      def _find_root_producer(self, node: beam.pipeline.AppliedPTransform):
        if node is None or not node.full_label:
          return None

        parent = self._find_root_producer(node.parent)
        if parent is None:
          return node

        return parent

      def _add_to_trace(self, node, trace):
        if '/' not in str(node):
          if node.inputs:
            producer = self._find_root_producer(node.inputs[0].producer)
            producer_name = producer.full_label if producer else ''
            trace.append((producer_name, node.full_label))

      def visit_transform(self, node: beam.pipeline.AppliedPTransform):
        self._add_to_trace(node, trace)

      def enter_composite_transform(
          self, node: beam.pipeline.AppliedPTransform):
        self._add_to_trace(node, trace)

    p.visit(TopLevelTracer())

    # Do the isomorphism test which states that the topological sort of the
    # graph yields a linear graph.
    trace_string = '\n'.join(str(t) for t in trace)
    prev_producer = ''
    for producer, consumer in trace:
      self.assertEqual(producer, prev_producer, trace_string)
      prev_producer = consumer


@unittest.skipIf(
    not ie.current_env().is_interactive_ready,
    '[interactive] dependency is not installed.')
@isolated_env
class ConfigForFlinkTest(unittest.TestCase):
  def setUp(self):
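    # The Dataproc-backed Flink flow presumably requires the cache to live on
    # GCS; a fake gs:// bucket suffices here because isolated_env mocks out
    # external services.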
    self.current_env.options.cache_root = 'gs://fake'

  def tearDown(self):
    self.current_env.options.cache_root = None

  def test_create_a_new_cluster_for_a_new_pipeline(self):
    clusters = self.current_env.clusters
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions(project='test-project', region='test-region')
    p = beam.Pipeline(runner=runner, options=options)
    runner.configure_for_flink(p, options)

    # Fetch the metadata and assert all side effects.
    meta = clusters.cluster_metadata(p)
    # The metadata should have all fields populated.
    self.assertEqual(meta.project_id, 'test-project')
    self.assertEqual(meta.region, 'test-region')
    self.assertTrue(meta.cluster_name.startswith('interactive-beam-'))
    self.assertTrue(meta.master_url.startswith('test-url'))
    self.assertEqual(meta.dashboard, 'test-dashboard')
    # The cluster is known now.
    self.assertIn(meta, clusters.dataproc_cluster_managers)
    self.assertIn(meta.master_url, clusters.master_urls)
    self.assertIn(p, clusters.pipelines)
    # The default cluster is updated to the created cluster.
    self.assertIs(meta, clusters.default_cluster_metadata)
    # The pipeline options are tuned for execution on the cluster.
    flink_options = options.view_as(FlinkRunnerOptions)
    self.assertEqual(flink_options.flink_master, meta.master_url)
    self.assertEqual(
        flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)

  def test_reuse_a_cluster_for_a_known_pipeline(self):
    clusters = self.current_env.clusters
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions(project='test-project', region='test-region')
    p = beam.Pipeline(runner=runner, options=options)
    meta = ClusterMetadata(project_id='test-project', region='test-region')
    dcm = DataprocClusterManager(meta)
    # Configure the clusters so that the pipeline is known.
    clusters.pipelines[p] = dcm
    runner.configure_for_flink(p, options)

    # A known cluster is reused.
    tuned_meta = clusters.cluster_metadata(p)
    self.assertIs(tuned_meta, meta)

  def test_reuse_a_known_cluster_for_unknown_pipeline(self):
    clusters = self.current_env.clusters
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions(project='test-project', region='test-region')
    p = beam.Pipeline(runner=runner, options=options)
    meta = ClusterMetadata(project_id='test-project', region='test-region')
    dcm = DataprocClusterManager(meta)
    # Configure the clusters so that the cluster is known.
    clusters.dataproc_cluster_managers[meta] = dcm
    clusters.set_default_cluster(meta)
    runner.configure_for_flink(p, options)

    # A known cluster is reused.
    tuned_meta = clusters.cluster_metadata(p)
    self.assertIs(tuned_meta, meta)
    # The pipeline is known.
    self.assertIn(p, clusters.pipelines)
    registered_dcm = clusters.pipelines[p]
    self.assertIn(p, registered_dcm.pipelines)

  def test_reuse_default_cluster_if_not_configured(self):
    clusters = self.current_env.clusters
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions()
    # Pipeline is not configured to run on Cloud.
    p = beam.Pipeline(runner=runner, options=options)
    meta = ClusterMetadata(project_id='test-project', region='test-region')
    meta.master_url = 'test-url'
    meta.dashboard = 'test-dashboard'
    dcm = DataprocClusterManager(meta)
    # Configure the clusters so that a default cluster is known.
    clusters.dataproc_cluster_managers[meta] = dcm
    clusters.set_default_cluster(meta)
    runner.configure_for_flink(p, options)

    # The default cluster is used.
    tuned_meta = clusters.cluster_metadata(p)
    self.assertIs(tuned_meta, clusters.default_cluster_metadata)
    # The pipeline is known.
    self.assertIn(p, clusters.pipelines)
    registered_dcm = clusters.pipelines[p]
    self.assertIn(p, registered_dcm.pipelines)
    # The pipeline options are tuned for execution on the cluster.
    flink_options = options.view_as(FlinkRunnerOptions)
    self.assertEqual(flink_options.flink_master, tuned_meta.master_url)
    self.assertEqual(
        flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)

  def test_worker_options_to_cluster_metadata(self):
    clusters = self.current_env.clusters
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions(project='test-project', region='test-region')
    worker_options = options.view_as(WorkerOptions)
    worker_options.num_workers = 2
    worker_options.subnetwork = 'test-network'
    worker_options.machine_type = 'test-machine-type'
    p = beam.Pipeline(runner=runner, options=options)
    runner.configure_for_flink(p, options)

    configured_meta = clusters.cluster_metadata(p)
    self.assertEqual(configured_meta.num_workers, worker_options.num_workers)
    self.assertEqual(configured_meta.subnetwork, worker_options.subnetwork)
    self.assertEqual(configured_meta.machine_type, worker_options.machine_type)

  def test_configure_flink_options(self):
    clusters = self.current_env.clusters
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions(project='test-project', region='test-region')
    p = beam.Pipeline(runner=runner, options=options)
    runner.configure_for_flink(p, options)

    flink_options = options.view_as(FlinkRunnerOptions)
    self.assertEqual(
        flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)
    self.assertTrue(flink_options.flink_master.startswith('test-url-'))

  def test_configure_flink_options_with_flink_version_overridden(self):
    clusters = self.current_env.clusters
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    options = PipelineOptions(project='test-project', region='test-region')
    flink_options = options.view_as(FlinkRunnerOptions)
    flink_options.flink_version = 'test-version'
    p = beam.Pipeline(runner=runner, options=options)
    runner.configure_for_flink(p, options)

    # The version is overridden to the Flink version used by the managed
    # cluster solution; currently the only one is Cloud Dataproc.
    self.assertEqual(
        flink_options.flink_version, clusters.DATAPROC_FLINK_VERSION)

  def test_strip_http_protocol_from_flink_master(self):
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    stripped = runner._strip_protocol_if_any('https://flink-master')

    self.assertEqual('flink-master', stripped)

  def test_no_strip_from_flink_master(self):
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    stripped = runner._strip_protocol_if_any('flink-master')

    self.assertEqual('flink-master', stripped)

  def test_no_strip_from_non_flink_master(self):
    runner = interactive_runner.InteractiveRunner(
        underlying_runner=FlinkRunner())
    stripped = runner._strip_protocol_if_any(None)

    self.assertIsNone(stripped)


if __name__ == '__main__':
  unittest.main()