github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/tests/bigquery_matcher.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """Bigquery data verifier for end-to-end test.""" 19 20 # pytype: skip-file 21 22 import concurrent 23 import logging 24 import time 25 26 from hamcrest.core.base_matcher import BaseMatcher 27 28 from apache_beam.io.gcp import bigquery_tools 29 from apache_beam.testing.test_utils import compute_hash 30 from apache_beam.testing.util import BeamAssertException 31 from apache_beam.testing.util import equal_to 32 from apache_beam.utils import retry 33 34 __all__ = ['BigqueryMatcher', 'BigQueryTableMatcher'] 35 36 # Protect against environments where bigquery library is not available. 
# pylint: disable=wrong-import-order, wrong-import-position
try:
  from google.cloud import bigquery
  from google.cloud.exceptions import GoogleCloudError
except ImportError:
  bigquery = None
# pylint: enable=wrong-import-order, wrong-import-position

# Number of retries used by the query/table lookups decorated below.
MAX_RETRIES = 5

_LOGGER = logging.getLogger(__name__)


def retry_on_http_timeout_and_value_error(exception):
  """Filter allowing retries on Bigquery errors and value error.

  Args:
    exception: The exception raised by the retried call.

  Returns:
    True if the exception is a GoogleCloudError, a ValueError or a
    concurrent.futures.TimeoutError; False otherwise.
  """
  # Import the submodule explicitly: a bare ``import concurrent`` does not
  # guarantee that ``concurrent.futures`` has been loaded.
  import concurrent.futures  # pylint: disable=redefined-outer-name,reimported
  return isinstance(
      exception,
      (GoogleCloudError, ValueError, concurrent.futures.TimeoutError))


class BigqueryMatcher(BaseMatcher):
  """Matcher that verifies the checksum of Bigquery data with given query.

  Fetch Bigquery data with given query, compute a hash string and compare
  with expected checksum.
  """
  def __init__(self, project, query, checksum, timeout_secs=0):
    """Initialize BigQueryMatcher object.

    Args:
      project: The name (string) of the project.
      query: The query (string) to perform.
      checksum: SHA-1 hash generated from a sorted list of lines
        read from expected output.
      timeout_secs: Duration to retry query until checksum matches. This
        is useful for DF streaming pipelines or BQ streaming inserts. The
        default (0) never retries.

    Raises:
      ImportError: If the Bigquery client library could not be imported.
      ValueError: If query or checksum is empty or not a string.
    """
    if bigquery is None:
      raise ImportError('Bigquery dependencies are not installed.')
    if not query or not isinstance(query, str):
      raise ValueError('Invalid argument: query. Please use non-empty string')
    if not checksum or not isinstance(checksum, str):
      raise ValueError(
          'Invalid argument: checksum. Please use non-empty string')
    self.project = project
    self.query = query
    self.expected_checksum = checksum
    # Checksum of the most recent query result; None until a query has run.
    self.checksum = None
    self.timeout_secs = timeout_secs

  def _matches(self, _):
    """Return True if the checksum of the query result is the expected one.

    The query is only issued while no checksum has been computed yet; later
    calls compare the stored checksum.
    """
    @retry.with_exponential_backoff(
        num_retries=1000,
        initial_delay_secs=0.5,
        max_delay_secs=30,
        stop_after_secs=self.timeout_secs,
    )
    def get_checksum():
      response = self._query_with_retry()
      _LOGGER.info(
          'Read from given query (%s), total rows %d',
          self.query,
          len(response))
      self.checksum = compute_hash(response)
      _LOGGER.info('Generate checksum: %s', self.checksum)
      if self.checksum != self.expected_checksum:
        # This exception is never raised beyond the enclosing method.
        raise ValueError(
            'Checksums do not match. Expected: %s, got: %s' %
            (self.expected_checksum, self.checksum))

    if self.checksum is None:
      try:
        get_checksum()
      except ValueError:
        # A mismatch is reported through the return value below, not by
        # raising.
        pass

    return self.checksum == self.expected_checksum

  @retry.with_exponential_backoff(
      num_retries=MAX_RETRIES,
      retry_filter=retry_on_http_timeout_and_value_error)
  def _query_with_retry(self):
    """Run Bigquery query with retry if got error http response.

    Returns:
      A list with the ``values()`` of every row in the query result.
    """
    _LOGGER.info('Attempting to perform query %s to BQ', self.query)
    # Create client here since it throws an exception if pickled.
    bigquery_client = bigquery.Client(self.project)
    query_job = bigquery_client.query(self.query)
    rows = query_job.result(timeout=60)
    return [row.values() for row in rows]

  def describe_to(self, description):
    """Append the expected checksum to the description."""
    description \
      .append_text("Expected checksum is ") \
      .append_text(self.expected_checksum)

  def describe_mismatch(self, pipeline_result, mismatch_description):
    """Append the checksum actually computed to the mismatch description."""
    # The checksum is None if no query has run yet, so stringify explicitly.
    mismatch_description \
      .append_text("Actual checksum is ") \
      .append_text(str(self.checksum))


class BigqueryFullResultMatcher(BigqueryMatcher):
  """Matcher that verifies Bigquery data with given query.

  Fetch Bigquery data with given query, compare to the expected data.
  """
  def __init__(self, project, query, data):
    """Initialize BigQueryMatcher object.

    Args:
      project: The name (string) of the project.
      query: The query (string) to perform.
      data: List of tuples with the expected data.
    """
    # The parent requires a non-empty checksum; this class never compares it.
    super().__init__(project, query, 'unused_checksum')
    self.expected_data = data
    # Rows returned by the query; None until the query has run.
    self.actual_data = None

  def _matches(self, _):
    """Return True if the query result equals the expected data.

    The query result is fetched on the first call and reused afterwards.
    """
    if self.actual_data is None:
      self.actual_data = self._get_query_result()
      _LOGGER.info('Result of query is: %r', self.actual_data)

    try:
      equal_to(self.expected_data)(self.actual_data)
      return True
    except BeamAssertException:
      return False

  def _get_query_result(self):
    """Run the query once (with retries on errors) and return its rows."""
    return self._query_with_retry()

  def describe_to(self, description):
    """Append the expected data to the description."""
    description \
      .append_text("Expected data is ") \
      .append_text(str(self.expected_data))

  def describe_mismatch(self, pipeline_result, mismatch_description):
    """Append the data actually read to the mismatch description."""
    mismatch_description \
      .append_text("Actual data is ") \
      .append_text(str(self.actual_data))


class BigqueryFullResultStreamingMatcher(BigqueryFullResultMatcher):
  """
  Matcher that verifies Bigquery data with given query.

  Fetch Bigquery data with given query, compare to the expected data.
  This matcher polls BigQuery until the no. of records in BigQuery is
  equal to the no. of records in expected data.
  A timeout can be specified.
  """

  # Default polling budget, in seconds.
  DEFAULT_TIMEOUT = 5 * 60

  def __init__(self, project, query, data, timeout=DEFAULT_TIMEOUT):
    """Initialize the streaming matcher.

    Args:
      project: The name (string) of the project.
      query: The query (string) to perform.
      data: List of tuples with the expected data.
      timeout: Seconds to keep polling before giving up.
    """
    super().__init__(project, query, data)
    self.timeout = timeout

  def _get_query_result(self):
    """Poll the query until it returns at least the expected number of rows.

    Returns:
      The first query result holding at least ``len(self.expected_data)``
      rows.

    Raises:
      TimeoutError: If no such result is obtained within ``self.timeout``
        seconds.
    """
    start_time = time.time()
    while time.time() - start_time <= self.timeout:
      response = self._query_with_retry()
      if len(response) >= len(self.expected_data):
        return response
      _LOGGER.debug('Query result contains %d rows', len(response))
      time.sleep(1)
    raise TimeoutError('Timeout exceeded for matcher.')  # noqa: F821


class BigQueryTableMatcher(BaseMatcher):
  """Matcher that verifies the properties of a Table in BigQuery."""
  def __init__(self, project, dataset, table, expected_properties):
    """Initialize the table matcher.

    Args:
      project: Project passed to the table lookup.
      dataset: Dataset passed to the table lookup.
      table: Table passed to the table lookup.
      expected_properties: Dict mapping property names to expected values.
        A dict value is matched key by key against the corresponding
        property of the table.

    Raises:
      ImportError: If the Bigquery client library could not be imported.
    """
    if bigquery is None:
      raise ImportError('Bigquery dependencies are not installed.')

    self.project = project
    self.dataset = dataset
    self.table = table
    self.expected_properties = expected_properties
    # Set by _matches; initialised here so describe_mismatch is safe to call
    # even if the table was never fetched.
    self.actual_table = None

  @retry.with_exponential_backoff(
      num_retries=MAX_RETRIES,
      retry_filter=retry_on_http_timeout_and_value_error)
  def _get_table_with_retry(self, bigquery_wrapper):
    """Fetch the table through the given wrapper, retrying on errors."""
    return bigquery_wrapper.get_table(self.project, self.dataset, self.table)

  def _matches(self, _):
    """Return True if every expected property matches the fetched table."""
    _LOGGER.info('Start verify Bigquery table properties.')
    # Run query
    bigquery_wrapper = bigquery_tools.BigQueryWrapper()

    self.actual_table = self._get_table_with_retry(bigquery_wrapper)

    _LOGGER.info('Table proto is %s', self.actual_table)

    return all(
        self._match_property(v, self._get_or_none(self.actual_table, k))
        for k, v in self.expected_properties.items())

  @staticmethod
  def _get_or_none(obj, attr):
    """Look up ``attr`` on ``obj`` as an attribute, then via ``obj.get``.

    Returns:
      The attribute value if present, else ``obj.get(attr, None)``, else
      None when neither lookup is possible.
    """
    try:
      return obj.__getattribute__(attr)
    except AttributeError:
      try:
        return obj.get(attr, None)
      except (AttributeError, TypeError):
        # AttributeError: obj has no ``get`` at all (e.g. obj is None).
        # TypeError: obj has a ``get`` that does not accept these arguments.
        return None

  @staticmethod
  def _match_property(expected, actual):
    """Compare an expected value with an actual one.

    A dict ``expected`` is matched recursively, key by key; any other value
    is compared with ``==``.
    """
    _LOGGER.info("Matching %s to %s", expected, actual)
    if isinstance(expected, dict):
      return all(
          BigQueryTableMatcher._match_property(
              v, BigQueryTableMatcher._get_or_none(actual, k))
          for k, v in expected.items())
    else:
      return expected == actual

  def describe_to(self, description):
    """Append the expected table attributes, sorted by name."""
    description \
      .append_text("Expected table attributes are ") \
      .append_text(str(sorted((k, v)
                              for k, v in self.expected_properties.items())))

  def describe_mismatch(self, pipeline_result, mismatch_description):
    """Append the actual values of the expected attributes, sorted by name."""
    mismatch_description \
      .append_text("Actual table attributes are ") \
      .append_text(str(sorted((k, self._get_or_none(self.actual_table, k))
                              for k in self.expected_properties)))