github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/spark_runner_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pytype: skip-file

import argparse
import logging
import shlex
import unittest
from shutil import rmtree
from tempfile import mkdtemp

import pytest

from apache_beam.options.pipeline_options import PortableOptions
from apache_beam.runners.portability import job_server
from apache_beam.runners.portability import portable_runner
from apache_beam.runners.portability import portable_runner_test

# Run as
#
# pytest spark_runner_test.py[::TestClass::test_case] \
#     --test-pipeline-options="--environment_type=LOOPBACK"
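#
# As a fuller, purely illustrative invocation (the jar path below is a
# placeholder; when --spark_job_server_jar is omitted, the jar is resolved via
# job_server.JavaJarJobServer.path_to_beam_jar in parse_options below):
#
# pytest spark_runner_test.py \
#     --test-pipeline-options="--environment_type=LOOPBACK \
#         --spark_job_server_jar=/path/to/beam-runners-spark-3-job-server.jar"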

_LOGGER = logging.getLogger(__name__)


class SparkRunnerTest(portable_runner_test.PortableRunnerTest):
  _use_grpc = True
  _use_subprocesses = True

  expansion_port = None
  spark_job_server_jar = None

  @pytest.fixture(autouse=True)
  def parse_options(self, request):
    if not request.config.option.test_pipeline_options:
      raise unittest.SkipTest(
          'Skipping because --test-pipeline-options is not specified.')
    test_pipeline_options = request.config.option.test_pipeline_options
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument(
        '--spark_job_server_jar',
        help='Job server jar to submit jobs.',
        action='store')
    parser.add_argument(
        '--environment_type',
        default='LOOPBACK',
        choices=['DOCKER', 'PROCESS', 'LOOPBACK'],
        help='Set the environment type for running user code. DOCKER runs '
        'user code in a container. PROCESS runs user code in '
        'automatically started processes. LOOPBACK runs user code on '
        'the same process that originally submitted the job.')
    parser.add_argument(
        '--environment_option',
        '--environment_options',
        dest='environment_options',
        action='append',
        default=None,
        help=(
            'Environment configuration for running the user code. '
            'Recognized options depend on --environment_type.\n '
            'For DOCKER: docker_container_image (optional)\n '
            'For PROCESS: process_command (required), process_variables '
            '(optional, comma-separated)\n '
            'For EXTERNAL: external_service_address (required)'))
    known_args, unknown_args = parser.parse_known_args(
        shlex.split(test_pipeline_options))
    if unknown_args:
      _LOGGER.warning('Discarding unrecognized arguments %s' % unknown_args)
    self.set_spark_job_server_jar(
        known_args.spark_job_server_jar or
        job_server.JavaJarJobServer.path_to_beam_jar(
            ':runners:spark:3:job-server:shadowJar'))
    self.environment_type = known_args.environment_type
    self.environment_options = known_args.environment_options
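  # Purely illustrative: to exercise a PROCESS environment instead of the
  # default LOOPBACK, the flags above could be passed as, e.g.,
  #
  #   --test-pipeline-options="--environment_type=PROCESS \
  #       --environment_option=process_command=/opt/apache/beam/boot"
  #
  # which parse_options turns into environment_type='PROCESS' and
  # environment_options=['process_command=/opt/apache/beam/boot']. The boot
  # binary path is an assumption for the example, not something this test
  # provides.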

  @classmethod
  def _subprocess_command(cls, job_port, expansion_port):
    # This temporary directory is removed at the end of this method; the job
    # server recreates it and uses it as its artifacts directory.
    tmp_dir = mkdtemp(prefix='sparktest')

    cls.expansion_port = expansion_port

    try:
      return [
          'java',
          '-Dbeam.spark.test.reuseSparkContext=true',
          '-jar',
          cls.spark_job_server_jar,
          '--spark-master-url',
          'local',
          '--artifacts-dir',
          tmp_dir,
          '--job-port',
          str(job_port),
          '--artifact-port',
          '0',
          '--expansion-port',
          str(expansion_port),
      ]
    finally:
      rmtree(tmp_dir)
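  # For orientation, the command assembled above launches the Spark job server
  # roughly as follows; the port numbers and jar name are illustrative, since
  # the real values come from the test harness and the resolved jar:
  #
  #   java -Dbeam.spark.test.reuseSparkContext=true \
  #       -jar beam-runners-spark-3-job-server-<version>.jar \
  #       --spark-master-url local --artifacts-dir /tmp/sparktestXXXXXX \
  #       --job-port <job_port> --artifact-port 0 --expansion-port <expansion_port>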

  @classmethod
  def get_runner(cls):
    return portable_runner.PortableRunner()

  @classmethod
  def get_expansion_service(cls):
    # TODO: Move the expansion service address into PipelineOptions.
    return 'localhost:%s' % cls.expansion_port

  @classmethod
  def set_spark_job_server_jar(cls, spark_job_server_jar):
    cls.spark_job_server_jar = spark_job_server_jar
  def create_options(self):
    options = super().create_options()
    options.view_as(PortableOptions).environment_type = self.environment_type
    options.view_as(
        PortableOptions).environment_options = self.environment_options

    return options

  def test_metrics(self):
    # Skip until Spark runner supports metrics.
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19496")

  def test_sdf(self):
    # Skip until Spark runner supports SDF.
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19468")

  def test_sdf_with_watermark_tracking(self):
    # Skip until Spark runner supports SDF.
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19468")

  def test_sdf_with_sdf_initiated_checkpointing(self):
    # Skip until Spark runner supports SDF.
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19468")

  def test_sdf_synthetic_source(self):
    # Skip until Spark runner supports SDF.
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19468")

  def test_callbacks_with_exception(self):
    # Skip until Spark runner supports bundle finalization.
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19517")

  def test_register_finalizations(self):
    # Skip until Spark runner supports bundle finalization.
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19517")

  def test_sdf_with_dofn_as_watermark_estimator(self):
    # Skip until Spark runner supports SDF and self-checkpoint.
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19468")

  def test_pardo_dynamic_timer(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/20179")

  def test_flattened_side_input(self):
    # Blocked on support for transcoding
    # https://jira.apache.org/jira/browse/BEAM-7236
    super().test_flattened_side_input(with_transcoding=False)

  def test_custom_merging_window(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/20641")

  # Inherits all other tests from PortableRunnerTest.


if __name__ == '__main__':
  # Run the tests.
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()