github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/test_dataflow_runner.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """Wrapper of Beam runners that's built for running and verifying e2e tests.""" 19 20 # pytype: skip-file 21 22 import logging 23 import time 24 25 from apache_beam.internal import pickler 26 from apache_beam.options.pipeline_options import GoogleCloudOptions 27 from apache_beam.options.pipeline_options import StandardOptions 28 from apache_beam.options.pipeline_options import TestOptions 29 from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner 30 from apache_beam.runners.runner import PipelineState 31 32 __all__ = ['TestDataflowRunner'] 33 34 # Dataflow take up to 10mins for the long tail of starting/stopping worker 35 # pool. 
WAIT_IN_STATE_TIMEOUT = 10 * 60

_LOGGER = logging.getLogger(__name__)


class TestDataflowRunner(DataflowRunner):
  """DataflowRunner that runs a test pipeline and verifies its result.

  After submitting the pipeline it waits for the job, optionally checks the
  result against the ``on_success_matcher`` test option, and cancels the job
  if it is still not in a terminal state once verification is over.
  """
  def run_pipeline(self, pipeline, options):
    """Execute the test pipeline and verify the test matcher.

    Args:
      pipeline: The pipeline to run; handed unchanged to
        ``DataflowRunner.run_pipeline``.
      options: Pipeline options. ``TestOptions`` supplies
        ``on_success_matcher`` and ``wait_until_finish_duration``;
        ``StandardOptions`` supplies ``streaming``. Note that
        ``on_success_matcher`` is reset to ``None`` on these options.

    Returns:
      The pipeline result, which is also stored on ``self.result``.

    Raises:
      IOError: If the result has no Dataflow job (from wait_until_in_state).
      RuntimeError: If a state wait times out (from wait_until_in_state).
    """
    test_options = options.view_as(TestOptions)
    on_success_matcher = test_options.on_success_matcher
    wait_duration = test_options.wait_until_finish_duration
    is_streaming = options.view_as(StandardOptions).streaming

    # [BEAM-1889] Do not send this to remote workers also, there is no need to
    # send this option to remote executors.
    test_options.on_success_matcher = None

    self.result = super().run_pipeline(pipeline, options)
    if self.result.has_job:
      # Build the URL once and reuse it for both the print and the log.
      console_url = self.build_console_url(options)
      # TODO(markflyhigh)(https://github.com/apache/beam/issues/18254): Use
      # print since Nose doesn't show logs in some cases.
      print('Worker logs: %s' % console_url)
      _LOGGER.info('Console log: ')
      _LOGGER.info(console_url)

    try:
      self.wait_until_in_state(PipelineState.RUNNING)

      if is_streaming and not wait_duration:
        _LOGGER.warning('Waiting indefinitely for streaming job.')
      self.result.wait_until_finish(duration=wait_duration)

      if on_success_matcher:
        # Imported lazily so hamcrest is only needed when a matcher is set.
        from hamcrest import assert_that as hc_assert_that
        # NOTE(review): the matcher is unpickled from a pipeline option; it is
        # assumed to come from the trusted test harness.
        hc_assert_that(self.result, pickler.loads(on_success_matcher))
    finally:
      # Do not leave a job running after the test, whether it passed or not.
      if not self.result.is_in_terminal_state():
        self.result.cancel()
        self.wait_until_in_state(PipelineState.CANCELLED)

    return self.result

  def build_console_url(self, options):
    """Build a console url of Dataflow job.

    Args:
      options: Pipeline options; ``GoogleCloudOptions`` supplies ``project``
        and ``region``.

    Returns:
      The console URL string for the job held by ``self.result``.
    """
    gcloud_options = options.view_as(GoogleCloudOptions)
    project = gcloud_options.project
    region_id = gcloud_options.region
    job_id = self.result.job_id()
    return (
        'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s' %
        (region_id, job_id, project))

  def wait_until_in_state(self, expected_state, timeout=WAIT_IN_STATE_TIMEOUT):
    """Wait until Dataflow pipeline enters a certain state.

    Polls ``self.result`` every 5 seconds until the job is in
    ``expected_state`` or in a terminal state.

    Args:
      expected_state: The state to wait for.
      timeout: Maximum number of seconds to keep polling.

    Returns:
      The job state read on the poll that ended the wait; this is either
      ``expected_state`` or whatever state the job had when it was found to
      be terminal.

    Raises:
      IOError: If ``self.result`` has no Dataflow job.
      RuntimeError: If neither condition is met within ``timeout`` seconds.
    """
    # Check for a job before anything asks for its id: without a job there is
    # no id to format into a URL, and attempting the lookup could fail and
    # hide the intended IOError below.
    if not self.result.has_job:
      _LOGGER.error(
          'Console URL unavailable: the pipeline result has no Dataflow job.')
      raise IOError('Failed to get the Dataflow job id.')

    # Region and project are not available here, so they stay placeholders.
    # The path mirrors the one produced by build_console_url.
    console_url = (
        'Console URL: https://console.cloud.google.com/dataflow/jobs/'
        f'<regionId>/{self.result.job_id()}?project=<projectId>')

    # Monotonic clock: the timeout must not be affected by wall-clock changes.
    start_time = time.monotonic()
    while time.monotonic() - start_time <= timeout:
      job_state = self.result.state
      if self.result.is_in_terminal_state() or job_state == expected_state:
        return job_state
      time.sleep(5)

    _LOGGER.error(console_url)
    raise RuntimeError(
        'Timeout after %d seconds while waiting for job %s '
        'enters expected state %s. Current state is %s.' %
        (timeout, self.result.job_id(), expected_state, self.result.state))