github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/interactive_runner.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
     18  """A runner that allows running Beam pipelines interactively.
    19  
    20  This module is experimental. No backwards-compatibility guarantees.
    21  """
    22  
    23  # pytype: skip-file
    24  
    25  import logging
    26  from typing import Optional
    27  
    28  import apache_beam as beam
    29  from apache_beam import runners
    30  from apache_beam.options.pipeline_options import FlinkRunnerOptions
    31  from apache_beam.options.pipeline_options import GoogleCloudOptions
    32  from apache_beam.options.pipeline_options import PipelineOptions
    33  from apache_beam.options.pipeline_options import WorkerOptions
    34  from apache_beam.pipeline import PipelineVisitor
    35  from apache_beam.runners.direct import direct_runner
    36  from apache_beam.runners.interactive import interactive_environment as ie
    37  from apache_beam.runners.interactive import pipeline_instrument as inst
    38  from apache_beam.runners.interactive import background_caching_job
    39  from apache_beam.runners.interactive.dataproc.types import ClusterMetadata
    40  from apache_beam.runners.interactive.display import pipeline_graph
    41  from apache_beam.runners.interactive.options import capture_control
    42  from apache_beam.runners.interactive.utils import to_element_list
    43  from apache_beam.runners.interactive.utils import watch_sources
    44  from apache_beam.testing.test_stream_service import TestStreamServiceController
    45  
     46  # Size of the PCollection samples cached.
    47  SAMPLE_SIZE = 8
    48  
    49  _LOGGER = logging.getLogger(__name__)
    50  
    51  
    52  class InteractiveRunner(runners.PipelineRunner):
    53    """An interactive runner for Beam Python pipelines.
    54  
    55    Allows interactively building and running Beam Python pipelines.
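
           A minimal usage sketch (typically in a Jupyter/IPython notebook). The
           pipeline below is illustrative; InteractiveRunner, Create,
           combiners.Count.PerElement and interactive_beam.show are existing Beam
           APIs:

             import apache_beam as beam
             from apache_beam.runners.interactive import interactive_beam as ib
             from apache_beam.runners.interactive.interactive_runner import (
                 InteractiveRunner)

             p = beam.Pipeline(InteractiveRunner())
             words = p | beam.Create(['to', 'be', 'or', 'not', 'to', 'be'])
             counts = words | beam.combiners.Count.PerElement()
             ib.show(counts)  # Runs a fragment and displays the sampled elements.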
    56    """
    57    def __init__(
    58        self,
    59        underlying_runner=None,
    60        render_option=None,
    61        skip_display=True,
    62        force_compute=True,
    63        blocking=True):
    64      """Constructor of InteractiveRunner.
    65  
    66      Args:
     67        underlying_runner: (runner.PipelineRunner) the runner that actually
     68            executes the pipeline; defaults to DirectRunner when not given.
     69        render_option: (str) decides how the pipeline graph is rendered. See
     70            display.pipeline_graph_renderer for available options.
     71        skip_display: (bool) whether to skip display operations when running the
     72            pipeline. Useful for large pipelines when display is not needed.
     73        force_compute: (bool) whether to force recomputation on every run. If
     74            True, always runs the whole pipeline and recomputes the data for all
     75            PCollections. If False, reuses data already computed by previous runs
     76            (including show API invocations from the interactive_beam module) and
     77            runs only the minimal pipeline fragment needed to compute the data
     78            that is not yet available.
     79        blocking: (bool) whether the pipeline run should block until completion.
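
             A hedged construction sketch (the argument values are illustrative,
             not recommended defaults):

               runner = InteractiveRunner(force_compute=False, blocking=False)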
    80      """
    81      self._underlying_runner = (
    82          underlying_runner or direct_runner.DirectRunner())
    83      self._render_option = render_option
    84      self._in_session = False
    85      self._skip_display = skip_display
    86      self._force_compute = force_compute
    87      self._blocking = blocking
    88  
    89    def is_fnapi_compatible(self):
    90      # TODO(https://github.com/apache/beam/issues/19937):
    91      # return self._underlying_runner.is_fnapi_compatible()
    92      return False
    93  
    94    def set_render_option(self, render_option):
    95      """Sets the rendering option.
    96  
    97      Args:
    98        render_option: (str) this parameter decides how the pipeline graph is
    99            rendered. See display.pipeline_graph_renderer for available options.
   100      """
   101      self._render_option = render_option
   102  
   103    def start_session(self):
   104      """Start the session that keeps back-end managers and workers alive.
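
             A hedged sketch (illustrative; the keep-alive only takes effect when
             the underlying runner implements __enter__/__exit__):

               runner = InteractiveRunner()
               runner.start_session()
               # ... build and run pipelines reusing the kept-alive workers ...
               runner.end_session()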
   105      """
   106      if self._in_session:
   107        return
   108  
   109      enter = getattr(self._underlying_runner, '__enter__', None)
   110      if enter is not None:
   111        _LOGGER.info('Starting session.')
   112        self._in_session = True
   113        enter()
   114      else:
   115        _LOGGER.error('Keep alive not supported.')
   116  
   117    def end_session(self):
    118      """End the session that keeps back-end managers and workers alive.
   119      """
   120      if not self._in_session:
   121        return
   122  
   123      exit = getattr(self._underlying_runner, '__exit__', None)
   124      if exit is not None:
   125        self._in_session = False
   126        _LOGGER.info('Ending session.')
   127        exit(None, None, None)
   128  
   129    def apply(self, transform, pvalueish, options):
   130      # TODO(qinyeli, BEAM-646): Remove runner interception of apply.
   131      return self._underlying_runner.apply(transform, pvalueish, options)
   132  
   133    def run_pipeline(self, pipeline, options):
   134      if not ie.current_env().options.enable_recording_replay:
   135        capture_control.evict_captured_data()
   136      if self._force_compute:
   137        ie.current_env().evict_computed_pcollections()
   138  
   139      # Make sure that sources without a user reference are still cached.
   140      watch_sources(pipeline)
   141  
   142      user_pipeline = ie.current_env().user_pipeline(pipeline)
   143  
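             # A hedged sketch of how a user typically reaches the branch below;
             # the construction itself is illustrative:
             #   from apache_beam.runners.portability.flink_runner import FlinkRunner
             #   p = beam.Pipeline(InteractiveRunner(underlying_runner=FlinkRunner()))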
   144      from apache_beam.runners.portability.flink_runner import FlinkRunner
   145      if isinstance(self._underlying_runner, FlinkRunner):
   146        self.configure_for_flink(user_pipeline, options)
   147  
   148      pipeline_instrument = inst.build_pipeline_instrument(pipeline, options)
   149  
    150      # The analyzed user_pipeline might be None if the given pipeline has
    151      # nothing to cache and cannot be traced back to a user-defined pipeline.
    152      # When it's None, there is nothing to cache (and hence no background
    153      # caching job to start) and no result to track, since no background
    154      # caching job is started at all.
   155      if user_pipeline:
   156        # Should use the underlying runner and run asynchronously.
   157        background_caching_job.attempt_to_run_background_caching_job(
   158            self._underlying_runner, user_pipeline, options)
   159        if (background_caching_job.has_source_to_cache(user_pipeline) and
   160            not background_caching_job.is_a_test_stream_service_running(
   161                user_pipeline)):
   162          streaming_cache_manager = ie.current_env().get_cache_manager(
   163              user_pipeline)
   164  
    165          # Only create the server if it doesn't already exist.
   166          if (streaming_cache_manager and
   167              not ie.current_env().get_test_stream_service_controller(
   168                  user_pipeline)):
   169  
   170            def exception_handler(e):
   171              _LOGGER.error(str(e))
   172              return True
   173  
   174            test_stream_service = TestStreamServiceController(
   175                streaming_cache_manager, exception_handler=exception_handler)
   176            test_stream_service.start()
   177            ie.current_env().set_test_stream_service_controller(
   178                user_pipeline, test_stream_service)
   179  
   180      pipeline_to_execute = beam.pipeline.Pipeline.from_runner_api(
   181          pipeline_instrument.instrumented_pipeline_proto(),
   182          self._underlying_runner,
   183          options)
   184  
   185      if ie.current_env().get_test_stream_service_controller(user_pipeline):
   186        endpoint = ie.current_env().get_test_stream_service_controller(
   187            user_pipeline).endpoint
   188  
    189        # TODO: construct the StreamingCacheManager and
    190        # TestStreamServiceController when the InteractiveEnvironment is imported.
   191        class TestStreamVisitor(PipelineVisitor):
   192          def visit_transform(self, transform_node):
   193            from apache_beam.testing.test_stream import TestStream
   194            if (isinstance(transform_node.transform, TestStream) and
   195                not transform_node.transform._events):
   196              transform_node.transform._endpoint = endpoint
   197  
   198        pipeline_to_execute.visit(TestStreamVisitor())
   199  
   200      if not self._skip_display:
   201        a_pipeline_graph = pipeline_graph.PipelineGraph(
   202            pipeline_instrument.original_pipeline_proto,
   203            render_option=self._render_option)
   204        a_pipeline_graph.display_graph()
   205  
   206      main_job_result = PipelineResult(
   207          pipeline_to_execute.run(), pipeline_instrument)
    208      # In addition to setting the pipeline result here, redundantly setting the
    209      # result from outer scopes is also recommended, since the user_pipeline
    210      # might not be available from within this scope.
   211      if user_pipeline:
   212        ie.current_env().set_pipeline_result(user_pipeline, main_job_result)
   213  
   214      if self._blocking:
   215        main_job_result.wait_until_finish()
   216  
   217      if main_job_result.state is beam.runners.runner.PipelineState.DONE:
   218        # pylint: disable=bad-option-value
   219        ie.current_env().mark_pcollection_computed(
   220            pipeline_instrument.cached_pcolls)
   221  
   222      return main_job_result
   223  
   224    def configure_for_flink(
   225        self, user_pipeline: beam.Pipeline, options: PipelineOptions) -> None:
   226      """Configures the pipeline options for running a job with Flink.
   227  
    228      When running with a FlinkRunner, a job server started from an uber jar
    229      (locally built or remotely downloaded) hosting the beam_job_api
    230      communicates with the Flink cluster located at the flink_master given in
    231      the pipeline options.
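
             A hedged sketch of options that would exercise the '[auto]' Dataproc
             path (the option names are existing Beam options; the project value is
             a placeholder):

               options = PipelineOptions(
                   flink_master='[auto]',   # FlinkRunnerOptions
                   project='my-project',    # GoogleCloudOptions
                   region='us-central1',    # GoogleCloudOptions
                   num_workers=3)           # WorkerOptions, copied to the cluster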
   232      """
   233      clusters = ie.current_env().clusters
   234      if clusters.pipelines.get(user_pipeline, None):
   235        # Noop for a known pipeline using a known Dataproc cluster.
   236        return
   237      flink_master = self._strip_protocol_if_any(
   238          options.view_as(FlinkRunnerOptions).flink_master)
   239      cluster_metadata = clusters.default_cluster_metadata
   240      if flink_master == '[auto]':
   241        # Try to create/reuse a cluster when no flink_master is given.
   242        project_id = options.view_as(GoogleCloudOptions).project
   243        region = options.view_as(GoogleCloudOptions).region or 'us-central1'
   244        if project_id:
   245          if clusters.default_cluster_metadata:
    246            # Reuse the default cluster metadata's name for a known cluster.
   247            cluster_metadata = ClusterMetadata(
   248                project_id=project_id,
   249                region=region,
   250                cluster_name=clusters.default_cluster_metadata.cluster_name)
   251          else:
   252            # Generate the metadata with a new unique cluster name.
   253            cluster_metadata = ClusterMetadata(
   254                project_id=project_id, region=region)
   255          # Add additional configurations.
   256          self._worker_options_to_cluster_metadata(options, cluster_metadata)
   257        # else use the default cluster metadata.
   258      elif flink_master in clusters.master_urls:
   259        cluster_metadata = clusters.cluster_metadata(flink_master)
   260      else:  # Noop if a self-hosted Flink is in use.
   261        return
   262      if not cluster_metadata:
   263        return  # Not even a default cluster to create/reuse, run Flink locally.
   264      dcm = clusters.create(cluster_metadata)
   265      # Side effects associated with the user_pipeline.
   266      clusters.pipelines[user_pipeline] = dcm
   267      dcm.pipelines.add(user_pipeline)
   268      self._configure_flink_options(
   269          options,
   270          clusters.DATAPROC_FLINK_VERSION,
   271          dcm.cluster_metadata.master_url)
   272  
   273    def _strip_protocol_if_any(self, flink_master: Optional[str]):
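             """Strips a URL scheme if present, e.g. 'https://host:8081' -> 'host:8081'."""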
   274      if flink_master:
   275        parts = flink_master.split('://')
   276        if len(parts) > 1:
   277          return parts[1]
   278      return flink_master
   279  
   280    def _worker_options_to_cluster_metadata(
   281        self, options: PipelineOptions, cluster_metadata: ClusterMetadata):
   282      worker_options = options.view_as(WorkerOptions)
   283      if worker_options.subnetwork:
   284        cluster_metadata.subnetwork = worker_options.subnetwork
   285      if worker_options.num_workers:
   286        cluster_metadata.num_workers = worker_options.num_workers
   287      if worker_options.machine_type:
   288        cluster_metadata.machine_type = worker_options.machine_type
   289  
   290    def _configure_flink_options(
   291        self, options: PipelineOptions, flink_version: str, master_url: str):
   292      flink_options = options.view_as(FlinkRunnerOptions)
   293      flink_options.flink_version = flink_version
    294      # flink_options.flink_job_server_jar will be populated by
    295      # apache_beam.utils.subprocess_server.JavaJarServer.path_to_beam_jar;
    296      # do not populate it explicitly.
   297      flink_options.flink_master = master_url
   298  
   299  
   300  class PipelineResult(beam.runners.runner.PipelineResult):
   301    """Provides access to information about a pipeline."""
   302    def __init__(self, underlying_result, pipeline_instrument):
   303      """Constructor of PipelineResult.
   304  
   305      Args:
   306        underlying_result: (PipelineResult) the result returned by the underlying
   307            runner running the pipeline.
    308        pipeline_instrument: (PipelineInstrument) the instrument describing the
    309            pipeline being executed with interactivity applied, plus related
    310            metadata such as where the interactivity-backing cache lies.
   311      """
   312      super().__init__(underlying_result.state)
   313      self._underlying_result = underlying_result
   314      self._pipeline_instrument = pipeline_instrument
   315  
   316    @property
   317    def state(self):
   318      return self._underlying_result.state
   319  
   320    def wait_until_finish(self):
   321      self._underlying_result.wait_until_finish()
   322  
   323    def get(self, pcoll, include_window_info=False):
   324      """Materializes the PCollection into a list.
   325  
    326      If include_window_info is True, the elements are returned as
    327      WindowedValues. Otherwise, each element is returned as-is.
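
             A hedged usage sketch (assuming `result` is the PipelineResult of a
             run with InteractiveRunner and `pcoll` has already been computed and
             cached):

               elements = result.get(pcoll)
               windowed = result.get(pcoll, include_window_info=True)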
   328      """
   329      return list(self.read(pcoll, include_window_info))
   330  
   331    def read(self, pcoll, include_window_info=False):
   332      """Reads the PCollection one element at a time from cache.
   333  
    334      If include_window_info is True, the elements are returned as
    335      WindowedValues. Otherwise, each element is returned as-is.
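
             A hedged sketch (same assumptions as get() above):

               for element in result.read(pcoll, include_window_info=True):
                 print(element)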
   336      """
   337      key = self._pipeline_instrument.cache_key(pcoll)
   338      cache_manager = ie.current_env().get_cache_manager(
   339          self._pipeline_instrument.user_pipeline)
   340      if key and cache_manager.exists('full', key):
   341        coder = cache_manager.load_pcoder('full', key)
   342        reader, _ = cache_manager.read('full', key)
   343        return to_element_list(reader, coder, include_window_info)
   344      else:
   345        raise ValueError('PCollection not available, please run the pipeline.')
   346  
   347    def cancel(self):
   348      self._underlying_result.cancel()