github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_it_test.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/wordcount_it_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """End-to-end test for the wordcount example."""
    19  
    20  # pytype: skip-file
    21  
    22  import logging
    23  import os
    24  import time
    25  import unittest
    26  
    27  import pytest
    28  from hamcrest.core.core.allof import all_of
    29  
    30  from apache_beam.examples import wordcount
    31  from apache_beam.internal.gcp import auth
    32  from apache_beam.testing.load_tests.load_test_metrics_utils import InfluxDBMetricsPublisherOptions
    33  from apache_beam.testing.load_tests.load_test_metrics_utils import MetricsReader
    34  from apache_beam.testing.pipeline_verifiers import FileChecksumMatcher
    35  from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
    36  from apache_beam.testing.test_pipeline import TestPipeline
    37  from apache_beam.testing.test_utils import delete_files
    38  
    39  
    40  class WordCountIT(unittest.TestCase):
    41  
    42    # The default checksum is a SHA-1 hash generated from a sorted list of
    43    # lines read from expected output. This value corresponds to the default
    44    # input of WordCount example.
    45    DEFAULT_CHECKSUM = '33535a832b7db6d78389759577d4ff495980b9c0'
    46  
    47    @pytest.mark.it_postcommit
    48    def test_wordcount_it(self):
    49      self._run_wordcount_it(wordcount.run)
    50  
    51    @pytest.mark.it_postcommit
    52    @pytest.mark.sickbay_direct
    53    @pytest.mark.sickbay_spark
    54    @pytest.mark.sickbay_flink
    55    def test_wordcount_impersonation_it(self):
    56      """Tests impersonation on dataflow.
    57  
    58      For testing impersonation, we use three ingredients:
    59      - a principal to impersonate
    60      - a dataflow service account that only that principal is
    61        allowed to launch jobs as
    62      - a temp root that only the above two accounts have access to
    63  
    64      Jenkins and Dataflow workers both run as GCE default service account.
    65      So we remove that account from all the above.
    66      """
    67      # Credentials need to be reset or this test will fail and credentials
    68      # from a previous test will be used.
    69      with auth._Credentials._credentials_lock:
    70        auth._Credentials._credentials_init = False
    71      try:
    72        ACCOUNT_TO_IMPERSONATE = (
    73            'allows-impersonation@apache-'
    74            'beam-testing.iam.gserviceaccount.com')
    75        RUNNER_ACCOUNT = (
    76            'impersonation-dataflow-worker@'
    77            'apache-beam-testing.iam.gserviceaccount.com')
    78        TEMP_DIR = 'gs://impersonation-test-bucket/temp-it'
    79        STAGING_LOCATION = 'gs://impersonation-test-bucket/staging-it'
    80        extra_options = {
    81            'impersonate_service_account': ACCOUNT_TO_IMPERSONATE,
    82            'service_account_email': RUNNER_ACCOUNT,
    83            'temp_location': TEMP_DIR,
    84            'staging_location': STAGING_LOCATION
    85        }
    86        self._run_wordcount_it(wordcount.run, **extra_options)
    87      finally:
    88        # Reset credentials for future tests.
    89        with auth._Credentials._credentials_lock:
    90          auth._Credentials._credentials_init = False
    91  
    92    @pytest.mark.it_postcommit
    93    @pytest.mark.it_validatescontainer
    94    def test_wordcount_fnapi_it(self):
    95      self._run_wordcount_it(wordcount.run, experiment='beam_fn_api')
    96  
    97    @pytest.mark.it_validatescontainer
    98    def test_wordcount_it_with_prebuilt_sdk_container_local_docker(self):
    99      self._run_wordcount_it(
   100          wordcount.run,
   101          experiment='beam_fn_api',
   102          prebuild_sdk_container_engine='local_docker')
   103  
   104    @pytest.mark.it_validatescontainer
   105    def test_wordcount_it_with_prebuilt_sdk_container_cloud_build(self):
   106      self._run_wordcount_it(
   107          wordcount.run,
   108          experiment='beam_fn_api',
   109          prebuild_sdk_container_engine='cloud_build')
   110  
   111    @pytest.mark.it_validatescontainer
   112    def test_wordcount_it_with_use_sibling_sdk_workers(self):
   113      self._run_wordcount_it(wordcount.run, experiment='use_sibling_sdk_workers')
   114  
   115    def _run_wordcount_it(self, run_wordcount, **opts):
   116      test_pipeline = TestPipeline(is_integration_test=True)
   117      extra_opts = {}
   118  
   119      # Set extra options to the pipeline for test purpose
   120      test_output = '/'.join([
   121          test_pipeline.get_option('output'),
   122          str(int(time.time() * 1000)),
   123          'results'
   124      ])
   125      extra_opts['output'] = test_output
   126  
   127      test_input = test_pipeline.get_option('input')
   128      if test_input:
   129        extra_opts['input'] = test_input
   130  
   131      arg_sleep_secs = test_pipeline.get_option('sleep_secs')
   132      sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
   133      expect_checksum = (
   134          test_pipeline.get_option('expect_checksum') or self.DEFAULT_CHECKSUM)
   135      pipeline_verifiers = [
   136          PipelineStateMatcher(),
   137          FileChecksumMatcher(
   138              test_output + '*-of-*', expect_checksum, sleep_secs)
   139      ]
   140      extra_opts['on_success_matcher'] = all_of(*pipeline_verifiers)
   141      extra_opts.update(opts)
   142  
   143      # Register clean up before pipeline execution
   144      self.addCleanup(delete_files, [test_output + '*'])
   145  
   146      publish_to_bq = bool(test_pipeline.get_option('publish_to_big_query'))
   147  
   148      # Start measure time for performance test
   149      start_time = time.time()
   150  
   151      # Get pipeline options from command argument: --test-pipeline-options,
   152      # and start pipeline job by calling pipeline main function.
   153      run_wordcount(
   154          test_pipeline.get_full_options_as_args(**extra_opts),
   155          save_main_session=False,
   156      )
   157  
   158      end_time = time.time()
   159      run_time = end_time - start_time
   160  
   161      if publish_to_bq:
   162        self._publish_metrics(test_pipeline, run_time)
   163  
   164    def _publish_metrics(self, pipeline, metric_value):
   165      influx_options = InfluxDBMetricsPublisherOptions(
   166          pipeline.get_option('influx_measurement'),
   167          pipeline.get_option('influx_db_name'),
   168          pipeline.get_option('influx_hostname'),
   169          os.getenv('INFLUXDB_USER'),
   170          os.getenv('INFLUXDB_USER_PASSWORD'),
   171      )
   172      metric_reader = MetricsReader(
   173          project_name=pipeline.get_option('project'),
   174          bq_table=pipeline.get_option('metrics_table'),
   175          bq_dataset=pipeline.get_option('metrics_dataset'),
   176          publish_to_bq=True,
   177          influxdb_options=influx_options,
   178      )
   179  
   180      metric_reader.publish_values([(
   181          'runtime',
   182          metric_value,
   183      )])
   184  
   185  
   186  if __name__ == '__main__':
   187    logging.getLogger().setLevel(logging.DEBUG)
   188    unittest.main()