github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/portability/flink_runner_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pytype: skip-file

import argparse
import logging
import shlex
import typing
import unittest
from os import linesep
from os import path
from os.path import exists
from shutil import rmtree
from tempfile import mkdtemp

import pytest

import apache_beam as beam
from apache_beam import Impulse
from apache_beam import Map
from apache_beam.io.external.generate_sequence import GenerateSequence
from apache_beam.io.kafka import ReadFromKafka
from apache_beam.io.kafka import WriteToKafka
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.options.pipeline_options import FlinkRunnerOptions
from apache_beam.options.pipeline_options import PortableOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.runners.portability import job_server
from apache_beam.runners.portability import portable_runner
from apache_beam.runners.portability import portable_runner_test
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.transforms.sql import SqlTransform

# Run as
#
# pytest flink_runner_test.py[::TestClass::test_case] \
#     --test-pipeline-options="--environment_type=LOOPBACK"
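#
# For example, a single test against a locally built job server can be run
# roughly like this (the jar path below is illustrative only):
#
# pytest flink_runner_test.py::FlinkRunnerTest::test_read \
#     --test-pipeline-options="--environment_type=LOOPBACK \
#         --flink_job_server_jar=/path/to/flink-job-server-shadow.jar"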

_LOGGER = logging.getLogger(__name__)

Row = typing.NamedTuple("Row", [("col1", int), ("col2", str)])
beam.coders.registry.register_coder(Row, beam.coders.RowCoder)


class FlinkRunnerTest(portable_runner_test.PortableRunnerTest):
  _use_grpc = True
  _use_subprocesses = True

  conf_dir = None
  expansion_port = None
  flink_job_server_jar = None

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.environment_type = None
    self.environment_config = None
    self.enable_commit = False

  def setUp(self):
    self.enable_commit = False

  @pytest.fixture(autouse=True)
  def parse_options(self, request):
    if not request.config.option.test_pipeline_options:
      raise unittest.SkipTest(
          'Skipping because --test-pipeline-options is not specified.')
    test_pipeline_options = request.config.option.test_pipeline_options
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument(
        '--flink_job_server_jar',
        help='Job server jar to submit jobs.',
        action='store')
    parser.add_argument(
        '--environment_type',
        default='LOOPBACK',
        choices=['DOCKER', 'PROCESS', 'LOOPBACK'],
        help='Set the environment type for running user code. DOCKER runs '
        'user code in a container. PROCESS runs user code in '
        'automatically started processes. LOOPBACK runs user code on '
        'the same process that originally submitted the job.')
    parser.add_argument(
        '--environment_option',
        '--environment_options',
        dest='environment_options',
        action='append',
        default=None,
        help=(
            'Environment configuration for running the user code. '
            'Recognized options depend on --environment_type.\n '
            'For DOCKER: docker_container_image (optional)\n '
            'For PROCESS: process_command (required), process_variables '
            '(optional, comma-separated)\n '
            'For EXTERNAL: external_service_address (required)'))
    known_args, unknown_args = parser.parse_known_args(
        shlex.split(test_pipeline_options))
    if unknown_args:
      _LOGGER.warning('Discarding unrecognized arguments %s' % unknown_args)
    self.set_flink_job_server_jar(
        known_args.flink_job_server_jar or
        job_server.JavaJarJobServer.path_to_beam_jar((
            ':runners:flink:%s:job-server:shadowJar' %
            FlinkRunnerOptions.PUBLISHED_FLINK_VERSIONS[-1])))
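    # The fallback above resolves the Gradle target for the newest published
    # Flink version, e.g. ':runners:flink:1.16:job-server:shadowJar' when
    # 1.16 is the last entry in PUBLISHED_FLINK_VERSIONS (the exact version
    # depends on the SDK release).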
    self.environment_type = known_args.environment_type
    self.environment_options = known_args.environment_options

  @classmethod
  def tearDownClass(cls):
    if cls.conf_dir and exists(cls.conf_dir):
      _LOGGER.info("removing conf dir: %s" % cls.conf_dir)
      rmtree(cls.conf_dir)
    super().tearDownClass()

  @classmethod
  def _create_conf_dir(cls):
    """Create (and save a static reference to) a "conf dir", used to provide
     metrics configs and verify metrics output.

     It gets cleaned up when the suite is done executing."""

    if hasattr(cls, 'conf_dir'):
      cls.conf_dir = mkdtemp(prefix='flinktest-conf')

      # path for a FileReporter to write metrics to
      cls.test_metrics_path = path.join(cls.conf_dir, 'test-metrics.txt')

      # path to write Flink configuration to
      conf_path = path.join(cls.conf_dir, 'flink-conf.yaml')
      file_reporter = 'org.apache.beam.runners.flink.metrics.FileReporter'
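      # Rendered, the configuration written below looks roughly like this
      # (the reporter path points at the per-run temp dir created above):
      #
      #   metrics.reporters: file
      #   metrics.reporter.file.class: org.apache.beam.runners.flink.metrics.FileReporter
      #   metrics.reporter.file.path: <conf_dir>/test-metrics.txt
      #   metrics.scope.operator: <operator_name>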
      with open(conf_path, 'w') as f:
        f.write(
            linesep.join([
                'metrics.reporters: file',
                'metrics.reporter.file.class: %s' % file_reporter,
                'metrics.reporter.file.path: %s' % cls.test_metrics_path,
                'metrics.scope.operator: <operator_name>',
            ]))

  @classmethod
  def _subprocess_command(cls, job_port, expansion_port):
    # tmp_dir is cleaned up at the end of this method; the job server
    # recreates and uses it.
    tmp_dir = mkdtemp(prefix='flinktest')

    cls._create_conf_dir()
    cls.expansion_port = expansion_port

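    # Assembled, the job-server invocation below is roughly equivalent to
    # (ports are supplied by the caller; directories are per-run temp dirs):
    #
    #   java -Dorg.slf4j.simpleLogger.defaultLogLevel=warn \
    #       -jar <flink_job_server_jar> \
    #       --flink-master [local] --flink-conf-dir <conf_dir> \
    #       --artifacts-dir <tmp_dir> --job-port <job_port> \
    #       --artifact-port 0 --expansion-port <expansion_port>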
    try:
      return [
          'java',
          '-Dorg.slf4j.simpleLogger.defaultLogLevel=warn',
          '-jar',
          cls.flink_job_server_jar,
          '--flink-master',
          '[local]',
          '--flink-conf-dir',
          cls.conf_dir,
          '--artifacts-dir',
          tmp_dir,
          '--job-port',
          str(job_port),
          '--artifact-port',
          '0',
          '--expansion-port',
          str(expansion_port),
      ]
    finally:
      rmtree(tmp_dir)

  @classmethod
  def get_runner(cls):
    return portable_runner.PortableRunner()

  @classmethod
  def get_expansion_service(cls):
    # TODO: Move the expansion service address into PipelineOptions.
    return 'localhost:%s' % cls.expansion_port

  @classmethod
  def set_flink_job_server_jar(cls, flink_job_server_jar):
    cls.flink_job_server_jar = flink_job_server_jar

  def create_options(self):
    options = super().create_options()
    options.view_as(DebugOptions).experiments = ['beam_fn_api']
    options._all_options['parallelism'] = 2
    options.view_as(PortableOptions).environment_type = self.environment_type
    options.view_as(
        PortableOptions).environment_options = self.environment_options
    if self.enable_commit:
      options.view_as(StandardOptions).streaming = True
      options._all_options['checkpointing_interval'] = 3000
      options._all_options['shutdown_sources_after_idle_ms'] = 60000
      options._all_options['number_of_execution_retries'] = 1

    return options

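  # On the command line, the options set in create_options above correspond
  # roughly to:
  #   --experiments=beam_fn_api --parallelism=2 --environment_type=<type>
  # plus, when enable_commit is set:
  #   --streaming --checkpointing_interval=3000 \
  #   --shutdown_sources_after_idle_ms=60000 --number_of_execution_retries=1
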
  # Can't read host files from within Docker, so read a file that is "local"
  # to the container instead.
  def test_read(self):
    print('name:', __name__)
    with self.create_pipeline() as p:
      lines = p | beam.io.ReadFromText('/etc/profile')
      assert_that(lines, lambda lines: len(lines) > 0)

  def test_no_subtransform_composite(self):
    raise unittest.SkipTest("BEAM-4781")

  def test_external_transform(self):
    with self.create_pipeline() as p:
      res = (
          p
          | GenerateSequence(
              start=1, stop=10, expansion_service=self.get_expansion_service()))

      assert_that(res, equal_to([i for i in range(1, 10)]))

  def test_expand_kafka_read(self):
    # We expect to fail here because we do not have a Kafka cluster handy.
    # Nevertheless, we check that the transform is expanded by the
    # ExpansionService and that the pipeline fails during execution.
    with self.assertRaises(Exception) as ctx:
      self.enable_commit = True
      with self.create_pipeline() as p:
        # pylint: disable=expression-not-assigned
        (
            p
            | ReadFromKafka(
                consumer_config={
                    'bootstrap.servers': 'notvalid1:7777, notvalid2:3531',
                    'group.id': 'any_group'
                },
                topics=['topic1', 'topic2'],
                key_deserializer='org.apache.kafka.'
                'common.serialization.'
                'ByteArrayDeserializer',
                value_deserializer='org.apache.kafka.'
                'common.serialization.'
                'LongDeserializer',
                commit_offset_in_finalize=True,
                timestamp_policy=ReadFromKafka.create_time_policy,
                expansion_service=self.get_expansion_service()))
    self.assertTrue(
        'No resolvable bootstrap urls given in bootstrap.servers' in str(
            ctx.exception),
        'Expected to fail due to invalid bootstrap.servers, but '
        'failed due to:\n%s' % str(ctx.exception))

  def test_expand_kafka_write(self):
    # We just test the expansion but do not execute.
    # pylint: disable=expression-not-assigned
    (
        self.create_pipeline()
        | Impulse()
        | Map(lambda input: (1, input))
        | WriteToKafka(
            producer_config={
                'bootstrap.servers': 'localhost:9092, notvalid2:3531'
            },
            topic='topic1',
            key_serializer='org.apache.kafka.'
            'common.serialization.'
            'LongSerializer',
            value_serializer='org.apache.kafka.'
            'common.serialization.'
            'ByteArraySerializer',
            expansion_service=self.get_expansion_service()))

  def test_sql(self):
    with self.create_pipeline() as p:
      output = (
          p
          | 'Create' >> beam.Create([Row(x, str(x)) for x in range(5)])
          | 'Sql' >> SqlTransform(
              """SELECT col1, col2 || '*' || col2 as col2,
                    power(col1, 2) as col3
             FROM PCOLLECTION
          """,
              expansion_service=self.get_expansion_service()))
      assert_that(
          output,
          equal_to([(x, '{x}*{x}'.format(x=x), x * x) for x in range(5)]))

  def test_flattened_side_input(self):
    # Blocked on support for transcoding
    # https://jira.apache.org/jira/browse/BEAM-6523
    super().test_flattened_side_input(with_transcoding=False)

  def test_metrics(self):
    super().test_metrics(check_gauge=False)

  def test_sdf_with_watermark_tracking(self):
    raise unittest.SkipTest("BEAM-2939")

  def test_callbacks_with_exception(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19526")

  def test_register_finalizations(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19526")

  def test_custom_merging_window(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/20641")

  # Inherits all other tests.


class FlinkRunnerTestOptimized(FlinkRunnerTest):
  # TODO: Remove these tests after resolving
  #  https://github.com/apache/beam/issues/19422 and enabling
  #  PortableRunnerOptimized
  def create_options(self):
    options = super().create_options()
    options.view_as(DebugOptions).experiments = [
        'pre_optimize=all'
    ] + options.view_as(DebugOptions).experiments
    return options

  def test_external_transform(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19461")

  def test_expand_kafka_read(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19461")

  def test_expand_kafka_write(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19461")

  def test_sql(self):
    raise unittest.SkipTest("https://github.com/apache/beam/issues/19461")

  def test_pack_combiners(self):
    # Stages produced by translations.pack_combiners are fused by
    # translations.greedily_fuse, which prevents the test from detecting
    # the packed stages via counter names.
    self._test_pack_combiners(assert_using_counter_names=False)


class FlinkRunnerTestStreaming(FlinkRunnerTest):
  def create_options(self):
    options = super().create_options()
    options.view_as(StandardOptions).streaming = True
    return options

  def test_callbacks_with_exception(self):
    self.enable_commit = True
    super().test_callbacks_with_exception()

  def test_register_finalizations(self):
    self.enable_commit = True
    super().test_register_finalizations()


if __name__ == '__main__':
  # Run the tests.
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()