github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/pipeline_verifiers.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """End-to-end test result verifiers 19 20 A set of verifiers that are used in end-to-end tests to verify state/output 21 of test pipeline job. Customized verifier should extend 22 `hamcrest.core.base_matcher.BaseMatcher` and override _matches. 23 """ 24 25 # pytype: skip-file 26 27 import logging 28 import time 29 30 from hamcrest.core.base_matcher import BaseMatcher 31 32 from apache_beam.io.filesystems import FileSystems 33 from apache_beam.runners.runner import PipelineState 34 from apache_beam.testing import test_utils as utils 35 from apache_beam.utils import retry 36 37 __all__ = [ 38 'PipelineStateMatcher', 39 'FileChecksumMatcher', 40 'retry_on_io_error_and_server_error', 41 ] 42 43 try: 44 from apitools.base.py.exceptions import HttpError 45 except ImportError: 46 HttpError = None 47 48 MAX_RETRIES = 4 49 50 _LOGGER = logging.getLogger(__name__) 51 52 53 class PipelineStateMatcher(BaseMatcher): 54 """Matcher that verify pipeline job terminated in expected state 55 56 Matcher compares the actual pipeline terminate state with expected. 57 By default, `PipelineState.DONE` is used as expected state. 58 """ 59 def __init__(self, expected_state=PipelineState.DONE): 60 self.expected_state = expected_state 61 62 def _matches(self, pipeline_result): 63 return pipeline_result.state == self.expected_state 64 65 def describe_to(self, description): 66 description \ 67 .append_text("Test pipeline expected terminated in state: ") \ 68 .append_text(self.expected_state) 69 70 def describe_mismatch(self, pipeline_result, mismatch_description): 71 mismatch_description \ 72 .append_text("Test pipeline job terminated in state: ") \ 73 .append_text(pipeline_result.state) 74 75 76 def retry_on_io_error_and_server_error(exception): 77 """Filter allowing retries on file I/O errors and service error.""" 78 return isinstance(exception, IOError) or \ 79 (HttpError is not None and isinstance(exception, HttpError)) 80 81 82 class FileChecksumMatcher(BaseMatcher): 83 """Matcher that verifies file(s) content by comparing file checksum. 84 85 Use apache_beam.io.filebasedsink to fetch file(s) from given path. 86 File checksum is a hash string computed from content of file(s). 87 """ 88 def __init__(self, file_path, expected_checksum, sleep_secs=None): 89 """Initialize a FileChecksumMatcher object 90 91 Args: 92 file_path : A string that is the full path of output file. This path 93 can contain globs. 94 expected_checksum : A hash string that is computed from expected 95 result. 96 sleep_secs : Number of seconds to wait before verification start. 97 Extra time are given to make sure output files are ready on FS. 98 """ 99 if sleep_secs is not None: 100 if isinstance(sleep_secs, int): 101 self.sleep_secs = sleep_secs 102 else: 103 raise ValueError( 104 'Sleep seconds, if received, must be int. ' 105 'But received: %r, %s' % (sleep_secs, type(sleep_secs))) 106 else: 107 self.sleep_secs = None 108 109 self.file_path = file_path 110 self.expected_checksum = expected_checksum 111 112 @retry.with_exponential_backoff( 113 num_retries=MAX_RETRIES, retry_filter=retry_on_io_error_and_server_error) 114 def _read_with_retry(self): 115 """Read path with retry if I/O failed""" 116 read_lines = [] 117 match_result = FileSystems.match([self.file_path])[0] 118 matched_path = [f.path for f in match_result.metadata_list] 119 if not matched_path: 120 raise IOError('No such file or directory: %s' % self.file_path) 121 122 _LOGGER.info( 123 'Find %d files in %s: \n%s', 124 len(matched_path), 125 self.file_path, 126 '\n'.join(matched_path)) 127 for path in matched_path: 128 with FileSystems.open(path, 'r') as f: 129 for line in f: 130 read_lines.append(line) 131 return read_lines 132 133 def _matches(self, _): 134 if self.sleep_secs: 135 # Wait to have output file ready on FS 136 _LOGGER.info('Wait %d seconds...', self.sleep_secs) 137 time.sleep(self.sleep_secs) 138 139 # Read from given file(s) path 140 read_lines = self._read_with_retry() 141 142 # Compute checksum 143 self.checksum = utils.compute_hash(read_lines) 144 _LOGGER.info( 145 'Read from given path %s, %d lines, checksum: %s.', 146 self.file_path, 147 len(read_lines), 148 self.checksum) 149 return self.checksum == self.expected_checksum 150 151 def describe_to(self, description): 152 description \ 153 .append_text("Expected checksum is ") \ 154 .append_text(self.expected_checksum) 155 156 def describe_mismatch(self, pipeline_result, mismatch_description): 157 mismatch_description \ 158 .append_text("Actual checksum is ") \ 159 .append_text(self.checksum)