github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/tests/utils.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Utility methods for testing on GCP."""

# pytype: skip-file

import logging
import secrets
import time

from apache_beam.io import filesystems
from apache_beam.io.gcp.pubsub import PubsubMessage
from apache_beam.utils import retry

# Protect against environments where the bigquery library is not available.
try:
  from google.api_core import exceptions as gexc
  from google.cloud import bigquery
except ImportError:
  gexc = None
  bigquery = None

_LOGGER = logging.getLogger(__name__)


class GcpTestIOError(retry.PermanentException):
  """Basic GCP IO error for testing. Functions that raise this error should
  not be retried."""
  pass


@retry.with_exponential_backoff(
    num_retries=3, retry_filter=retry.retry_on_server_errors_filter)
def create_bq_dataset(project, dataset_base_name):
  """Creates an empty BigQuery dataset.

  Args:
    project: Project to work in.
    dataset_base_name: Prefix for dataset id.

  Returns:
    A ``google.cloud.bigquery.dataset.DatasetReference`` object pointing to the
    new dataset.
  """
  client = bigquery.Client(project=project)
  unique_dataset_name = '%s%d%s' % (
      dataset_base_name, int(time.time()), secrets.token_hex(3))
  dataset_ref = client.dataset(unique_dataset_name, project=project)
  dataset = bigquery.Dataset(dataset_ref)
  client.create_dataset(dataset)
  return dataset_ref


@retry.with_exponential_backoff(
    num_retries=3, retry_filter=retry.retry_on_server_errors_filter)
def delete_bq_dataset(project, dataset_ref):
  """Deletes a BigQuery dataset and its contents.

  Args:
    project: Project to work in.
    dataset_ref: A ``google.cloud.bigquery.dataset.DatasetReference`` object
      pointing to the dataset to delete.
  """
  client = bigquery.Client(project=project)
  client.delete_dataset(dataset_ref, delete_contents=True)


@retry.with_exponential_backoff(
    num_retries=3, retry_filter=retry.retry_on_server_errors_filter)
def delete_bq_table(project, dataset_id, table_id):
  """Deletes a BigQuery table.

  Args:
    project: Name of the project.
    dataset_id: Name of the dataset the table belongs to.
    table_id: Name of the table.
  """
  _LOGGER.info(
      'Clean up a BigQuery table with project: %s, dataset: %s, '
      'table: %s.',
      project,
      dataset_id,
      table_id)
  client = bigquery.Client(project=project)
  table_ref = client.dataset(dataset_id).table(table_id)
  try:
    client.delete_table(table_ref)
  except gexc.NotFound:
    raise GcpTestIOError('BigQuery table does not exist: %s' % table_ref)


@retry.with_exponential_backoff(
    num_retries=3, retry_filter=retry.retry_on_server_errors_filter)
def delete_directory(directory):
  """Deletes a directory in a filesystem.

  Args:
    directory: Full path to a directory supported by Beam filesystems (e.g.
      "gs://mybucket/mydir/", "s3://...", ...)
  """
  filesystems.FileSystems.delete([directory])


def write_to_pubsub(
    pub_client,
    topic_path,
    messages,
    with_attributes=False,
    chunk_size=100,
    delay_between_chunks=0.1):
  """Publishes messages to a Pub/Sub topic in chunks.

  Args:
    pub_client: A Pub/Sub publisher client (e.g.
      ``google.cloud.pubsub_v1.PublisherClient``).
    topic_path: Full path of the topic to publish to.
    messages: Messages to publish; ``PubsubMessage`` objects if
      ``with_attributes`` is True, otherwise raw payloads.
    with_attributes: Whether to publish message attributes along with the data.
    chunk_size: Number of messages published per batch.
    delay_between_chunks: Seconds to sleep after each batch.
  """
  for start in range(0, len(messages), chunk_size):
    message_chunk = messages[start:start + chunk_size]
    if with_attributes:
      futures = [
          pub_client.publish(topic_path, message.data, **message.attributes)
          for message in message_chunk
      ]
    else:
      futures = [
          pub_client.publish(topic_path, message) for message in message_chunk
      ]
    # Block until every message in the chunk has been published.
    for future in futures:
      future.result()
    time.sleep(delay_between_chunks)


def read_from_pubsub(
    sub_client,
    subscription_path,
    with_attributes=False,
    number_of_elements=None,
    timeout=None):
  """Pulls and acknowledges messages from a Pub/Sub subscription.

  At least one of ``number_of_elements`` or ``timeout`` must be specified;
  otherwise a ``ValueError`` is raised.

  Args:
    sub_client: A Pub/Sub subscriber client (e.g.
      ``google.cloud.pubsub_v1.SubscriberClient``).
    subscription_path: Full path of the subscription to pull from.
    with_attributes: If True, collect ``PubsubMessage`` objects; otherwise
      collect only the message payloads.
    number_of_elements: Stop once this many messages have been received.
    timeout: Stop after this many seconds have elapsed.

  Returns:
    A list of received messages (or payloads, if ``with_attributes`` is False).
  """
  if number_of_elements is None and timeout is None:
    raise ValueError("Either number_of_elements or timeout must be specified.")
  messages = []
  start_time = time.time()

  while ((number_of_elements is None or len(messages) < number_of_elements) and
         (timeout is None or (time.time() - start_time) < timeout)):
    try:
      response = sub_client.pull(
          subscription_path, max_messages=1000, retry=None, timeout=10)
    except (gexc.RetryError, gexc.DeadlineExceeded):
      continue
    ack_ids = [msg.ack_id for msg in response.received_messages]
    sub_client.acknowledge(subscription=subscription_path, ack_ids=ack_ids)
    for msg in response.received_messages:
      message = PubsubMessage._from_message(msg.message)
      if with_attributes:
        messages.append(message)
      else:
        messages.append(message.data)
  return messages