github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/dataflow/test_dataflow_runner.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Wrapper of Beam runners that's built for running and verifying e2e tests."""

# pytype: skip-file

import logging
import time

from apache_beam.internal import pickler
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import TestOptions
from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
from apache_beam.runners.runner import PipelineState

__all__ = ['TestDataflowRunner']

# Dataflow can take up to 10 minutes for the long tail of starting/stopping
# the worker pool.
WAIT_IN_STATE_TIMEOUT = 10 * 60

_LOGGER = logging.getLogger(__name__)


class TestDataflowRunner(DataflowRunner):
  def run_pipeline(self, pipeline, options):
    """Execute the test pipeline and verify the test matcher."""
    test_options = options.view_as(TestOptions)
    on_success_matcher = test_options.on_success_matcher
    wait_duration = test_options.wait_until_finish_duration
    is_streaming = options.view_as(StandardOptions).streaming

    # [BEAM-1889] Clear the matcher before submission; there is no need to
    # send this option to remote workers.
    test_options.on_success_matcher = None

    self.result = super().run_pipeline(pipeline, options)
    if self.result.has_job:
      # TODO(markflyhigh)(https://github.com/apache/beam/issues/18254): Use
      # print since Nose doesn't show logs in some cases.
      print('Worker logs: %s' % self.build_console_url(options))
      _LOGGER.info('Console log: ')
      _LOGGER.info(self.build_console_url(options))

    try:
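      # Block until the service reports the job as RUNNING (or already
      # terminal) before starting the wait_until_finish clock below.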
      self.wait_until_in_state(PipelineState.RUNNING)

      if is_streaming and not wait_duration:
        _LOGGER.warning('Waiting indefinitely for streaming job.')
      self.result.wait_until_finish(duration=wait_duration)

      if on_success_matcher:
        from hamcrest import assert_that as hc_assert_that
        hc_assert_that(self.result, pickler.loads(on_success_matcher))
    finally:
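      # Always tear down a job that is still active so a failed assertion or
      # a timed-out wait does not leak a running Dataflow job.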
      if not self.result.is_in_terminal_state():
        self.result.cancel()
        self.wait_until_in_state(PipelineState.CANCELLED)

    return self.result

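  # A minimal sketch of how an integration test typically drives this runner.
  # TestPipeline, PipelineStateMatcher, and get_full_options_as_args are real
  # Beam testing utilities, but this particular test body is hypothetical:
  #
  #   from hamcrest.core.core.allof import all_of
  #   from apache_beam.runners.runner import PipelineState
  #   from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
  #   from apache_beam.testing.test_pipeline import TestPipeline
  #
  #   test_pipeline = TestPipeline(is_integration_test=True)
  #   state_verifier = PipelineStateMatcher(PipelineState.DONE)
  #   # get_full_options_as_args pickles the matcher into the
  #   # --on_success_matcher flag that run_pipeline above unpickles.
  #   args = test_pipeline.get_full_options_as_args(
  #       on_success_matcher=all_of(state_verifier))
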
  def build_console_url(self, options):
    """Build the console URL of a Dataflow job."""
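    # With, e.g., region 'us-central1', job id '<jobId>' and project
    # 'my-project' (hypothetical values), this yields:
    # .../dataflow/jobs/us-central1/<jobId>?project=my-project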
    project = options.view_as(GoogleCloudOptions).project
    region_id = options.view_as(GoogleCloudOptions).region
    job_id = self.result.job_id()
    return (
        'https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s' %
        (region_id, job_id, project))

  def wait_until_in_state(self, expected_state, timeout=WAIT_IN_STATE_TIMEOUT):
    """Wait until the Dataflow pipeline enters the expected state."""
    console_url = (
        "Console URL: https://console.cloud.google.com/dataflow/"
        f"<regionId>/{self.result.job_id()}?project=<projectId>")
    if not self.result.has_job:
      _LOGGER.error(console_url)
      raise IOError('Failed to get the Dataflow job id.')

    start_time = time.time()
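    # Poll the job state every 5 seconds until it reaches expected_state or
    # any terminal state, giving up after `timeout` seconds.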
    while time.time() - start_time <= timeout:
      job_state = self.result.state
      if self.result.is_in_terminal_state() or job_state == expected_state:
        return job_state
      time.sleep(5)
    _LOGGER.error(console_url)
    raise RuntimeError(
        'Timeout after %d seconds while waiting for job %s to '
        'enter expected state %s. Current state is %s.' %
        (timeout, self.result.job_id(), expected_state, self.result.state))
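
# A minimal sketch of the on_success_matcher round trip that run_pipeline
# relies on, assuming any picklable PyHamcrest matcher (equal_to is just an
# illustrative choice):
#
#   from hamcrest import equal_to
#   from apache_beam.internal import pickler
#
#   serialized = pickler.dumps(equal_to(42))
#   matcher = pickler.loads(serialized)
#   assert matcher.matches(42)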