#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""End-to-end test for the wordcount example."""

# pytype: skip-file

import logging
import os
import time
import unittest

import pytest
from hamcrest.core.core.allof import all_of

from apache_beam.examples import wordcount
from apache_beam.internal.gcp import auth
from apache_beam.testing.load_tests.load_test_metrics_utils import InfluxDBMetricsPublisherOptions
from apache_beam.testing.load_tests.load_test_metrics_utils import MetricsReader
from apache_beam.testing.pipeline_verifiers import FileChecksumMatcher
from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.test_utils import delete_files


class WordCountIT(unittest.TestCase):
  """Integration tests that run the wordcount example end to end.

  Each test delegates to ``_run_wordcount_it``, which launches
  ``wordcount.run`` with the options of a ``TestPipeline`` and verifies the
  final pipeline state and the checksum of the written output.
  """

  # The default checksum is a SHA-1 hash generated from a sorted list of
  # lines read from expected output. This value corresponds to the default
  # input of WordCount example.
  DEFAULT_CHECKSUM = '33535a832b7db6d78389759577d4ff495980b9c0'

  @pytest.mark.it_postcommit
  def test_wordcount_it(self):
    self._run_wordcount_it(wordcount.run)

  @pytest.mark.it_postcommit
  @pytest.mark.sickbay_direct
  @pytest.mark.sickbay_spark
  @pytest.mark.sickbay_flink
  def test_wordcount_impersonation_it(self):
    """Tests impersonation on dataflow.

    For testing impersonation, we use three ingredients:
    - a principal to impersonate
    - a dataflow service account that only that principal is
      allowed to launch jobs as
    - a temp root that only the above two accounts have access to

    Jenkins and Dataflow workers both run as GCE default service account.
    So we remove that account from all the above.
    """
    # Credentials need to be reset or this test will fail and credentials
    # from a previous test will be used.
    self._reset_gcp_credentials()
    try:
      ACCOUNT_TO_IMPERSONATE = (
          'allows-impersonation@apache-'
          'beam-testing.iam.gserviceaccount.com')
      RUNNER_ACCOUNT = (
          'impersonation-dataflow-worker@'
          'apache-beam-testing.iam.gserviceaccount.com')
      TEMP_DIR = 'gs://impersonation-test-bucket/temp-it'
      STAGING_LOCATION = 'gs://impersonation-test-bucket/staging-it'
      extra_options = {
          'impersonate_service_account': ACCOUNT_TO_IMPERSONATE,
          'service_account_email': RUNNER_ACCOUNT,
          'temp_location': TEMP_DIR,
          'staging_location': STAGING_LOCATION
      }
      self._run_wordcount_it(wordcount.run, **extra_options)
    finally:
      # Reset credentials for future tests.
      self._reset_gcp_credentials()

  @pytest.mark.it_postcommit
  @pytest.mark.it_validatescontainer
  def test_wordcount_fnapi_it(self):
    self._run_wordcount_it(wordcount.run, experiment='beam_fn_api')

  @pytest.mark.it_validatescontainer
  def test_wordcount_it_with_prebuilt_sdk_container_local_docker(self):
    self._run_wordcount_it(
        wordcount.run,
        experiment='beam_fn_api',
        prebuild_sdk_container_engine='local_docker')

  @pytest.mark.it_validatescontainer
  def test_wordcount_it_with_prebuilt_sdk_container_cloud_build(self):
    self._run_wordcount_it(
        wordcount.run,
        experiment='beam_fn_api',
        prebuild_sdk_container_engine='cloud_build')

  @pytest.mark.it_validatescontainer
  def test_wordcount_it_with_use_sibling_sdk_workers(self):
    self._run_wordcount_it(wordcount.run, experiment='use_sibling_sdk_workers')

  @staticmethod
  def _reset_gcp_credentials():
    """Marks the cached GCP credentials as uninitialized.

    The flag is flipped while holding the credentials lock, exactly as the
    impersonation test needs both before and after it runs.
    """
    with auth._Credentials._credentials_lock:
      auth._Credentials._credentials_init = False

  @staticmethod
  def _flag_enabled(value):
    """Interprets a pipeline option value as an on/off flag.

    Option values are handled as strings elsewhere in this test (for example
    ``sleep_secs`` is passed through ``int``), so a plain ``bool(value)``
    would treat the text ``'false'`` as enabled. Strings are therefore
    compared case-insensitively against the usual "off" spellings.

    Args:
      value: The raw option value; may be None when the option is unset.

    Returns:
      False for None, for an empty string and for '0', 'false', 'no' or
      'off' (ignoring case and surrounding whitespace); otherwise the
      truthiness of ``value``.
    """
    if isinstance(value, str):
      return value.strip().lower() not in ('', '0', 'false', 'no', 'off')
    return bool(value)

  def _run_wordcount_it(self, run_wordcount, **opts):
    """Runs a wordcount pipeline and verifies its state and output checksum.

    Args:
      run_wordcount: Callable that starts the pipeline. It is called with the
        list of pipeline arguments and ``save_main_session=False``.
      **opts: Extra pipeline options. They are applied last, so they override
        the ``output``, ``input`` and ``on_success_matcher`` values computed
        here.
    """
    test_pipeline = TestPipeline(is_integration_test=True)
    extra_opts = {}

    # Set extra options to the pipeline for test purpose
    test_output = '/'.join([
        test_pipeline.get_option('output'),
        # Millisecond timestamp keeps the output of separate runs apart.
        str(int(time.time() * 1000)),
        'results'
    ])
    extra_opts['output'] = test_output

    test_input = test_pipeline.get_option('input')
    if test_input:
      extra_opts['input'] = test_input

    arg_sleep_secs = test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    expect_checksum = (
        test_pipeline.get_option('expect_checksum') or self.DEFAULT_CHECKSUM)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        FileChecksumMatcher(
            test_output + '*-of-*', expect_checksum, sleep_secs)
    ]
    extra_opts['on_success_matcher'] = all_of(*pipeline_verifiers)
    extra_opts.update(opts)

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [test_output + '*'])

    # Parse the flag text instead of calling bool() on it, so that
    # --publish_to_big_query=false really disables publishing.
    publish_to_bq = self._flag_enabled(
        test_pipeline.get_option('publish_to_big_query'))

    # Start measure time for performance test
    start_time = time.time()

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    run_wordcount(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False,
    )

    end_time = time.time()
    run_time = end_time - start_time

    if publish_to_bq:
      self._publish_metrics(test_pipeline, run_time)

  def _publish_metrics(self, pipeline, metric_value):
    """Publishes ``metric_value`` as the 'runtime' metric.

    The InfluxDB measurement, database name and hostname, as well as the
    project, metrics table and metrics dataset, are read from the pipeline
    options. The InfluxDB user and password come from the ``INFLUXDB_USER``
    and ``INFLUXDB_USER_PASSWORD`` environment variables.

    Args:
      pipeline: The ``TestPipeline`` whose options configure the publishers.
      metric_value: The value to publish under the name 'runtime'.
    """
    influx_options = InfluxDBMetricsPublisherOptions(
        pipeline.get_option('influx_measurement'),
        pipeline.get_option('influx_db_name'),
        pipeline.get_option('influx_hostname'),
        os.getenv('INFLUXDB_USER'),
        os.getenv('INFLUXDB_USER_PASSWORD'),
    )
    metric_reader = MetricsReader(
        project_name=pipeline.get_option('project'),
        bq_table=pipeline.get_option('metrics_table'),
        bq_dataset=pipeline.get_option('metrics_dataset'),
        publish_to_bq=True,
        influxdb_options=influx_options,
    )

    metric_reader.publish_values([(
        'runtime',
        metric_value,
    )])


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()