github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/pipeline_verifiers.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/pipeline_verifiers.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """End-to-end test result verifiers
    19  
    20  A set of verifiers that are used in end-to-end tests to verify state/output
    21  of test pipeline job. Customized verifier should extend
    22  `hamcrest.core.base_matcher.BaseMatcher` and override _matches.
    23  """
    24  
    25  # pytype: skip-file
    26  
    27  import logging
    28  import time
    29  
    30  from hamcrest.core.base_matcher import BaseMatcher
    31  
    32  from apache_beam.io.filesystems import FileSystems
    33  from apache_beam.runners.runner import PipelineState
    34  from apache_beam.testing import test_utils as utils
    35  from apache_beam.utils import retry
    36  
    37  __all__ = [
    38      'PipelineStateMatcher',
    39      'FileChecksumMatcher',
    40      'retry_on_io_error_and_server_error',
    41  ]
    42  
    43  try:
    44    from apitools.base.py.exceptions import HttpError
    45  except ImportError:
    46    HttpError = None
    47  
    48  MAX_RETRIES = 4
    49  
    50  _LOGGER = logging.getLogger(__name__)
    51  
    52  
    53  class PipelineStateMatcher(BaseMatcher):
    54    """Matcher that verify pipeline job terminated in expected state
    55  
    56    Matcher compares the actual pipeline terminate state with expected.
    57    By default, `PipelineState.DONE` is used as expected state.
    58    """
    59    def __init__(self, expected_state=PipelineState.DONE):
    60      self.expected_state = expected_state
    61  
    62    def _matches(self, pipeline_result):
    63      return pipeline_result.state == self.expected_state
    64  
    65    def describe_to(self, description):
    66      description \
    67        .append_text("Test pipeline expected terminated in state: ") \
    68        .append_text(self.expected_state)
    69  
    70    def describe_mismatch(self, pipeline_result, mismatch_description):
    71      mismatch_description \
    72        .append_text("Test pipeline job terminated in state: ") \
    73        .append_text(pipeline_result.state)
    74  
    75  
    76  def retry_on_io_error_and_server_error(exception):
    77    """Filter allowing retries on file I/O errors and service error."""
    78    return isinstance(exception, IOError) or \
    79            (HttpError is not None and isinstance(exception, HttpError))
    80  
    81  
    82  class FileChecksumMatcher(BaseMatcher):
    83    """Matcher that verifies file(s) content by comparing file checksum.
    84  
    85    Use apache_beam.io.filebasedsink to fetch file(s) from given path.
    86    File checksum is a hash string computed from content of file(s).
    87    """
    88    def __init__(self, file_path, expected_checksum, sleep_secs=None):
    89      """Initialize a FileChecksumMatcher object
    90  
    91      Args:
    92        file_path : A string that is the full path of output file. This path
    93          can contain globs.
    94        expected_checksum : A hash string that is computed from expected
    95          result.
    96        sleep_secs : Number of seconds to wait before verification start.
    97          Extra time are given to make sure output files are ready on FS.
    98      """
    99      if sleep_secs is not None:
   100        if isinstance(sleep_secs, int):
   101          self.sleep_secs = sleep_secs
   102        else:
   103          raise ValueError(
   104              'Sleep seconds, if received, must be int. '
   105              'But received: %r, %s' % (sleep_secs, type(sleep_secs)))
   106      else:
   107        self.sleep_secs = None
   108  
   109      self.file_path = file_path
   110      self.expected_checksum = expected_checksum
   111  
   112    @retry.with_exponential_backoff(
   113        num_retries=MAX_RETRIES, retry_filter=retry_on_io_error_and_server_error)
   114    def _read_with_retry(self):
   115      """Read path with retry if I/O failed"""
   116      read_lines = []
   117      match_result = FileSystems.match([self.file_path])[0]
   118      matched_path = [f.path for f in match_result.metadata_list]
   119      if not matched_path:
   120        raise IOError('No such file or directory: %s' % self.file_path)
   121  
   122      _LOGGER.info(
   123          'Find %d files in %s: \n%s',
   124          len(matched_path),
   125          self.file_path,
   126          '\n'.join(matched_path))
   127      for path in matched_path:
   128        with FileSystems.open(path, 'r') as f:
   129          for line in f:
   130            read_lines.append(line)
   131      return read_lines
   132  
   133    def _matches(self, _):
   134      if self.sleep_secs:
   135        # Wait to have output file ready on FS
   136        _LOGGER.info('Wait %d seconds...', self.sleep_secs)
   137        time.sleep(self.sleep_secs)
   138  
   139      # Read from given file(s) path
   140      read_lines = self._read_with_retry()
   141  
   142      # Compute checksum
   143      self.checksum = utils.compute_hash(read_lines)
   144      _LOGGER.info(
   145          'Read from given path %s, %d lines, checksum: %s.',
   146          self.file_path,
   147          len(read_lines),
   148          self.checksum)
   149      return self.checksum == self.expected_checksum
   150  
   151    def describe_to(self, description):
   152      description \
   153        .append_text("Expected checksum is ") \
   154        .append_text(self.expected_checksum)
   155  
   156    def describe_mismatch(self, pipeline_result, mismatch_description):
   157      mismatch_description \
   158        .append_text("Actual checksum is ") \
   159        .append_text(self.checksum)