github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/tests/bigquery_matcher.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Bigquery data verifier for end-to-end test."""
    19  
    20  # pytype: skip-file
    21  
    22  import concurrent
    23  import logging
    24  import time
    25  
    26  from hamcrest.core.base_matcher import BaseMatcher
    27  
    28  from apache_beam.io.gcp import bigquery_tools
    29  from apache_beam.testing.test_utils import compute_hash
    30  from apache_beam.testing.util import BeamAssertException
    31  from apache_beam.testing.util import equal_to
    32  from apache_beam.utils import retry
    33  
# Names exported by ``from <module> import *``.
__all__ = ['BigqueryMatcher', 'BigQueryTableMatcher']

# Protect against environments where bigquery library is not available.
# pylint: disable=wrong-import-order, wrong-import-position
try:
  from google.cloud import bigquery
  from google.cloud.exceptions import GoogleCloudError
except ImportError:
  # Sentinel checked by the matcher constructors, which raise ImportError
  # when it is None. NOTE: GoogleCloudError stays undefined in this case.
  bigquery = None
# pylint: enable=wrong-import-order, wrong-import-position

# Retry count passed to the retry decorators on the BigQuery calls below.
MAX_RETRIES = 5

# Module-level logger shared by all matchers in this file.
_LOGGER = logging.getLogger(__name__)
    48  
    49  
    50  def retry_on_http_timeout_and_value_error(exception):
    51    """Filter allowing retries on Bigquery errors and value error."""
    52    return isinstance(
    53        exception,
    54        (GoogleCloudError, ValueError, concurrent.futures.TimeoutError))
    55  
    56  
    57  class BigqueryMatcher(BaseMatcher):
    58    """Matcher that verifies the checksum of Bigquery data with given query.
    59  
    60    Fetch Bigquery data with given query, compute a hash string and compare
    61    with expected checksum.
    62    """
    63    def __init__(self, project, query, checksum, timeout_secs=0):
    64      """Initialize BigQueryMatcher object.
    65      Args:
    66        project: The name (string) of the project.
    67        query: The query (string) to perform.
    68        checksum: SHA-1 hash generated from a sorted list of lines
    69          read from expected output.
    70        timeout_secs: Duration to retry query until checksum matches. This
    71          is useful for DF streaming pipelines or BQ streaming inserts. The
    72          default (0) never retries.
    73      """
    74      if bigquery is None:
    75        raise ImportError('Bigquery dependencies are not installed.')
    76      if not query or not isinstance(query, str):
    77        raise ValueError('Invalid argument: query. Please use non-empty string')
    78      if not checksum or not isinstance(checksum, str):
    79        raise ValueError(
    80            'Invalid argument: checksum. Please use non-empty string')
    81      self.project = project
    82      self.query = query
    83      self.expected_checksum = checksum
    84      self.checksum = None
    85      self.timeout_secs = timeout_secs
    86  
    87    def _matches(self, _):
    88      @retry.with_exponential_backoff(
    89          num_retries=1000,
    90          initial_delay_secs=0.5,
    91          max_delay_secs=30,
    92          stop_after_secs=self.timeout_secs,
    93      )
    94      def get_checksum():
    95        response = self._query_with_retry()
    96        _LOGGER.info(
    97            'Read from given query (%s), total rows %d',
    98            self.query,
    99            len(response))
   100        self.checksum = compute_hash(response)
   101        _LOGGER.info('Generate checksum: %s', self.checksum)
   102        if self.checksum != self.expected_checksum:
   103          # This exception is never raised beyond the enclosing method.
   104          raise ValueError(
   105              'Checksums do not match. Expected: %s, got: %s' %
   106              (self.expected_checksum, self.checksum))
   107  
   108      if self.checksum is None:
   109        try:
   110          get_checksum()
   111        except ValueError:
   112          pass
   113  
   114      return self.checksum == self.expected_checksum
   115  
   116    @retry.with_exponential_backoff(
   117        num_retries=MAX_RETRIES,
   118        retry_filter=retry_on_http_timeout_and_value_error)
   119    def _query_with_retry(self):
   120      """Run Bigquery query with retry if got error http response"""
   121      _LOGGER.info('Attempting to perform query %s to BQ', self.query)
   122      # Create client here since it throws an exception if pickled.
   123      bigquery_client = bigquery.Client(self.project)
   124      query_job = bigquery_client.query(self.query)
   125      rows = query_job.result(timeout=60)
   126      return [row.values() for row in rows]
   127  
   128    def describe_to(self, description):
   129      description \
   130        .append_text("Expected checksum is ") \
   131        .append_text(self.expected_checksum)
   132  
   133    def describe_mismatch(self, pipeline_result, mismatch_description):
   134      mismatch_description \
   135        .append_text("Actual checksum is ") \
   136        .append_text(self.checksum)
   137  
   138  
   139  class BigqueryFullResultMatcher(BigqueryMatcher):
   140    """Matcher that verifies Bigquery data with given query.
   141  
   142    Fetch Bigquery data with given query, compare to the expected data.
   143    """
   144    def __init__(self, project, query, data):
   145      """Initialize BigQueryMatcher object.
   146      Args:
   147        project: The name (string) of the project.
   148        query: The query (string) to perform.
   149        data: List of tuples with the expected data.
   150      """
   151      super().__init__(project, query, 'unused_checksum')
   152      self.expected_data = data
   153      self.actual_data = None
   154  
   155    def _matches(self, _):
   156      if self.actual_data is None:
   157        self.actual_data = self._get_query_result()
   158        _LOGGER.info('Result of query is: %r', self.actual_data)
   159  
   160      try:
   161        equal_to(self.expected_data)(self.actual_data)
   162        return True
   163      except BeamAssertException:
   164        return False
   165  
   166    def _get_query_result(self):
   167      return self._query_with_retry()
   168  
   169    def describe_to(self, description):
   170      description \
   171        .append_text("Expected data is ") \
   172        .append_text(self.expected_data)
   173  
   174    def describe_mismatch(self, pipeline_result, mismatch_description):
   175      mismatch_description \
   176        .append_text("Actual data is ") \
   177        .append_text(self.actual_data)
   178  
   179  
   180  class BigqueryFullResultStreamingMatcher(BigqueryFullResultMatcher):
   181    """
   182    Matcher that verifies Bigquery data with given query.
   183  
   184    Fetch Bigquery data with given query, compare to the expected data.
   185    This matcher polls BigQuery until the no. of records in BigQuery is
   186    equal to the no. of records in expected data.
   187    A timeout can be specified.
   188    """
   189  
   190    DEFAULT_TIMEOUT = 5 * 60
   191  
   192    def __init__(self, project, query, data, timeout=DEFAULT_TIMEOUT):
   193      super().__init__(project, query, data)
   194      self.timeout = timeout
   195  
   196    def _get_query_result(self):
   197      start_time = time.time()
   198      while time.time() - start_time <= self.timeout:
   199        response = self._query_with_retry()
   200        if len(response) >= len(self.expected_data):
   201          return response
   202        _LOGGER.debug('Query result contains %d rows' % len(response))
   203        time.sleep(1)
   204      raise TimeoutError('Timeout exceeded for matcher.')  # noqa: F821
   205  
   206  
   207  class BigQueryTableMatcher(BaseMatcher):
   208    """Matcher that verifies the properties of a Table in BigQuery."""
   209    def __init__(self, project, dataset, table, expected_properties):
   210      if bigquery is None:
   211        raise ImportError('Bigquery dependencies are not installed.')
   212  
   213      self.project = project
   214      self.dataset = dataset
   215      self.table = table
   216      self.expected_properties = expected_properties
   217  
   218    @retry.with_exponential_backoff(
   219        num_retries=MAX_RETRIES,
   220        retry_filter=retry_on_http_timeout_and_value_error)
   221    def _get_table_with_retry(self, bigquery_wrapper):
   222      return bigquery_wrapper.get_table(self.project, self.dataset, self.table)
   223  
   224    def _matches(self, _):
   225      _LOGGER.info('Start verify Bigquery table properties.')
   226      # Run query
   227      bigquery_wrapper = bigquery_tools.BigQueryWrapper()
   228  
   229      self.actual_table = self._get_table_with_retry(bigquery_wrapper)
   230  
   231      _LOGGER.info('Table proto is %s', self.actual_table)
   232  
   233      return all(
   234          self._match_property(v, self._get_or_none(self.actual_table, k)) for k,
   235          v in self.expected_properties.items())
   236  
   237    @staticmethod
   238    def _get_or_none(obj, attr):
   239      try:
   240        return obj.__getattribute__(attr)
   241      except AttributeError:
   242        try:
   243          return obj.get(attr, None)
   244        except TypeError:
   245          return None
   246  
   247    @staticmethod
   248    def _match_property(expected, actual):
   249      _LOGGER.info("Matching %s to %s", expected, actual)
   250      if isinstance(expected, dict):
   251        return all(
   252            BigQueryTableMatcher._match_property(
   253                v, BigQueryTableMatcher._get_or_none(actual, k)) for k,
   254            v in expected.items())
   255      else:
   256        return expected == actual
   257  
   258    def describe_to(self, description):
   259      description \
   260        .append_text("Expected table attributes are ") \
   261        .append_text(sorted((k, v)
   262                            for k, v in self.expected_properties.items()))
   263  
   264    def describe_mismatch(self, pipeline_result, mismatch_description):
   265      mismatch_description \
   266        .append_text("Actual table attributes are ") \
   267        .append_text(sorted((k, self._get_or_none(self.actual_table, k))
   268                            for k in self.expected_properties))