github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/tests/utils.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Utility methods for testing on GCP."""

# pytype: skip-file

import logging
import secrets
import time

from apache_beam.io import filesystems
from apache_beam.io.gcp.pubsub import PubsubMessage
from apache_beam.utils import retry

# Protect against environments where bigquery library is not available.
try:
  from google.api_core import exceptions as gexc
  from google.cloud import bigquery
except ImportError:
  gexc = None
  bigquery = None

_LOGGER = logging.getLogger(__name__)


class GcpTestIOError(retry.PermanentException):
  """Basic GCP IO error for testing. Functions that raise this error should
  not be retried."""
  pass


@retry.with_exponential_backoff(
    num_retries=3, retry_filter=retry.retry_on_server_errors_filter)
def create_bq_dataset(project, dataset_base_name):
  """Creates an empty BigQuery dataset.

  Args:
    project: Project to work in.
    dataset_base_name: Prefix for dataset id.

  Returns:
    A ``google.cloud.bigquery.dataset.DatasetReference`` object pointing to the
    new dataset.
  """
  client = bigquery.Client(project=project)
  unique_dataset_name = '%s%d%s' % (
      dataset_base_name, int(time.time()), secrets.token_hex(3))
  dataset_ref = client.dataset(unique_dataset_name, project=project)
  dataset = bigquery.Dataset(dataset_ref)
  client.create_dataset(dataset)
  return dataset_ref


@retry.with_exponential_backoff(
    num_retries=3, retry_filter=retry.retry_on_server_errors_filter)
def delete_bq_dataset(project, dataset_ref):
  """Deletes a BigQuery dataset and its contents.

  Args:
    project: Project to work in.
    dataset_ref: A ``google.cloud.bigquery.dataset.DatasetReference`` object
      pointing to the dataset to delete.
  """
  client = bigquery.Client(project=project)
  client.delete_dataset(dataset_ref, delete_contents=True)

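
# Illustrative usage sketch (not part of the upstream module): how a test
# might pair create_bq_dataset with delete_bq_dataset for setup and teardown.
# The project id and dataset prefix below are hypothetical.
def _example_bq_dataset_lifecycle():
  project = 'my-gcp-project'  # hypothetical project id
  dataset_ref = create_bq_dataset(project, 'beam_it_dataset')
  try:
    # ... run the test against dataset_ref.dataset_id here ...
    pass
  finally:
    # Always clean up, even if the test body raised.
    delete_bq_dataset(project, dataset_ref)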

@retry.with_exponential_backoff(
    num_retries=3, retry_filter=retry.retry_on_server_errors_filter)
def delete_bq_table(project, dataset_id, table_id):
  """Deletes a BigQuery table.

  Args:
    project: Name of the project.
    dataset_id: Name of the dataset the table belongs to.
    table_id: Name of the table.
  """
  _LOGGER.info(
      'Clean up a BigQuery table with project: %s, dataset: %s, '
      'table: %s.',
      project,
      dataset_id,
      table_id)
  client = bigquery.Client(project=project)
  table_ref = client.dataset(dataset_id).table(table_id)
  try:
    client.delete_table(table_ref)
  except gexc.NotFound:
    raise GcpTestIOError('BigQuery table does not exist: %s' % table_ref)

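
# Illustrative usage sketch (not part of the upstream module): deleting a test
# table and tolerating the case where it was never created. The project,
# dataset and table names are hypothetical.
def _example_cleanup_bq_table():
  try:
    delete_bq_table('my-gcp-project', 'beam_it_dataset', 'beam_it_table')
  except GcpTestIOError:
    # The table did not exist. GcpTestIOError extends retry.PermanentException,
    # so the decorated delete is not retried for this case.
    pass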

@retry.with_exponential_backoff(
    num_retries=3, retry_filter=retry.retry_on_server_errors_filter)
def delete_directory(directory):
  """Delete a directory in a filesystem.

  Args:
    directory: Full path to a directory supported by Beam filesystems (e.g.
      "gs://mybucket/mydir/", "s3://...", ...)
  """
  filesystems.FileSystems.delete([directory])

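
# Illustrative usage sketch (not part of the upstream module): removing a
# hypothetical GCS output directory left behind by an integration test.
def _example_cleanup_output_directory():
  delete_directory('gs://my-bucket/temp/it-outputs/')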

def write_to_pubsub(
    pub_client,
    topic_path,
    messages,
    with_attributes=False,
    chunk_size=100,
    delay_between_chunks=0.1):
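  """Publishes messages to a Pub/Sub topic in fixed-size chunks.

  Args:
    pub_client: A ``google.cloud.pubsub.PublisherClient``.
    topic_path: Full topic path, e.g. "projects/<project>/topics/<topic>".
    messages: Messages to publish; ``PubsubMessage`` objects when
      with_attributes is True, raw ``bytes`` payloads otherwise.
    with_attributes: Whether to publish each message's attributes as well.
    chunk_size: Number of messages published per batch.
    delay_between_chunks: Seconds to sleep between batches.
  """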
  for start in range(0, len(messages), chunk_size):
    message_chunk = messages[start:start + chunk_size]
    if with_attributes:
      futures = [
          pub_client.publish(topic_path, message.data, **message.attributes)
          for message in message_chunk
      ]
    else:
      futures = [
          pub_client.publish(topic_path, message) for message in message_chunk
      ]
    for future in futures:
      future.result()
    time.sleep(delay_between_chunks)


def read_from_pubsub(
    sub_client,
    subscription_path,
    with_attributes=False,
    number_of_elements=None,
    timeout=None):
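  """Pulls messages from a Pub/Sub subscription until enough elements arrive
  or the timeout expires.

  At least one of number_of_elements or timeout must be provided.

  Args:
    sub_client: A ``google.cloud.pubsub.SubscriberClient``.
    subscription_path: Full subscription path, e.g.
      "projects/<project>/subscriptions/<subscription>".
    with_attributes: If True, collect ``PubsubMessage`` objects; otherwise
      collect only the ``bytes`` payloads.
    number_of_elements: Stop once this many messages have been received.
    timeout: Stop after this many seconds, even if fewer messages arrived.

  Returns:
    A list of the received messages or payloads.
  """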
  if number_of_elements is None and timeout is None:
    raise ValueError("Either number_of_elements or timeout must be specified.")
  messages = []
  start_time = time.time()

  while ((number_of_elements is None or len(messages) < number_of_elements) and
         (timeout is None or (time.time() - start_time) < timeout)):
    try:
      # The flattened arguments of the Pub/Sub v2 client are keyword-only.
      response = sub_client.pull(
          subscription=subscription_path,
          max_messages=1000,
          retry=None,
          timeout=10)
    except (gexc.RetryError, gexc.DeadlineExceeded):
      continue
    ack_ids = [msg.ack_id for msg in response.received_messages]
    # Only acknowledge when the pull actually returned messages.
    if ack_ids:
      sub_client.acknowledge(subscription=subscription_path, ack_ids=ack_ids)
    for msg in response.received_messages:
      message = PubsubMessage._from_message(msg.message)
      if with_attributes:
        messages.append(message)
      else:
        messages.append(message.data)
  return messages
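

# Illustrative usage sketch (not part of the upstream module): a round trip
# that publishes test payloads and reads them back. Client construction and
# resource paths are hypothetical; real tests create and delete the topic and
# subscription themselves.
def _example_pubsub_round_trip():
  from google.cloud import pubsub_v1  # assumed available in the test env

  pub_client = pubsub_v1.PublisherClient()
  sub_client = pubsub_v1.SubscriberClient()
  topic_path = pub_client.topic_path('my-gcp-project', 'beam_it_topic')
  subscription_path = sub_client.subscription_path(
      'my-gcp-project', 'beam_it_subscription')

  payloads = [b'a', b'b', b'c']
  write_to_pubsub(pub_client, topic_path, payloads)
  received = read_from_pubsub(
      sub_client,
      subscription_path,
      number_of_elements=len(payloads),
      timeout=60)
  assert sorted(received) == sorted(payloads)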