github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/external/xlang_bigqueryio_it_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Unit tests for cross-language BigQuery sources and sinks."""
# pytype: skip-file

import datetime
import logging
import os
import secrets
import time
import unittest
from decimal import Decimal

import pytest
from hamcrest.core import assert_that as hamcrest_assert

import apache_beam as beam
from apache_beam.io.external.generate_sequence import GenerateSequence
from apache_beam.io.gcp.bigquery import StorageWriteToBigQuery
from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher
from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultStreamingMatcher
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.utils.timestamp import Timestamp

# Protect against environments where bigquery library is not available.
# pylint: disable=wrong-import-order, wrong-import-position
try:
  from apitools.base.py.exceptions import HttpError
except ImportError:
  HttpError = None
# pylint: enable=wrong-import-order, wrong-import-position

_LOGGER = logging.getLogger(__name__)

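# Note on prerequisites (summarizing the decorators and fixtures below): the
# whole suite is skipped unless EXPANSION_PORT is set, and it expects a GCP
# Java expansion service listening on localhost at that port. Every test also
# talks to a real BigQuery project: setUp() creates a uniquely named dataset
# derived from BIGQUERY_DATASET and tearDown() deletes it, contents included,
# so no state is shared between tests.
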
@pytest.mark.uses_gcp_java_expansion_service
@unittest.skipUnless(
    os.environ.get('EXPANSION_PORT'),
    "EXPANSION_PORT environment var is not provided.")
class BigQueryXlangStorageWriteIT(unittest.TestCase):
  BIGQUERY_DATASET = 'python_xlang_storage_write'

  ELEMENTS = [
      # (int, float, numeric, string, bool, bytes, timestamp)
      {
          "int": 1,
          "float": 0.1,
          "numeric": Decimal("1.11"),
          "str": "a",
          "bool": True,
          "bytes": b'a',
          "timestamp": Timestamp(1000, 100)
      },
      {
          "int": 2,
          "float": 0.2,
          "numeric": Decimal("2.22"),
          "str": "b",
          "bool": False,
          "bytes": b'b',
          "timestamp": Timestamp(2000, 200)
      },
      {
          "int": 3,
          "float": 0.3,
          "numeric": Decimal("3.33"),
          "str": "c",
          "bool": True,
          "bytes": b'd',
          "timestamp": Timestamp(3000, 300)
      },
      {
          "int": 4,
          "float": 0.4,
          "numeric": Decimal("4.44"),
          "str": "d",
          "bool": False,
          "bytes": b'd',
          "timestamp": Timestamp(4000, 400)
      }
  ]
  ALL_TYPES_SCHEMA = (
      "int:INTEGER,float:FLOAT,numeric:NUMERIC,str:STRING,"
      "bool:BOOLEAN,bytes:BYTES,timestamp:TIMESTAMP")

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.args = self.test_pipeline.get_full_options_as_args()
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s_%s_%s' % (
        self.BIGQUERY_DATASET, str(int(time.time())), secrets.token_hex(3))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    _LOGGER.info(
        "Created dataset %s in project %s", self.dataset_id, self.project)

    _LOGGER.info("expansion port: %s", os.environ.get('EXPANSION_PORT'))
    self.expansion_service = ('localhost:%s' % os.environ.get('EXPANSION_PORT'))

  def tearDown(self):
    try:
      _LOGGER.info(
          "Deleting dataset %s in project %s", self.dataset_id, self.project)
      self.bigquery_client._delete_dataset(
          project_id=self.project,
          dataset_id=self.dataset_id,
          delete_contents=True)
    except HttpError:
      _LOGGER.debug(
          'Failed to clean up dataset %s in project %s',
          self.dataset_id,
          self.project)

  def parse_expected_data(self, expected_elements):
    data = []
    for row in expected_elements:
      values = list(row.values())
      for i, val in enumerate(values):
        if isinstance(val, Timestamp):
          # BigQuery matcher query returns a datetime.datetime object
          values[i] = val.to_utc_datetime().replace(
              tzinfo=datetime.timezone.utc)
      data.append(tuple(values))

    return data
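
  # Shared batch path: each batch test below builds a "project:dataset.table"
  # spec, writes its items with WriteToBigQuery using the STORAGE_WRITE_API
  # method (expanded by the Java service at self.expansion_service), and then
  # asserts the table contents against a SELECT * query via
  # BigqueryFullResultMatcher.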

  def run_storage_write_test(
      self, table_name, items, schema, use_at_least_once=False):
    table_id = '{}:{}.{}'.format(self.project, self.dataset_id, table_name)

    bq_matcher = BigqueryFullResultMatcher(
        project=self.project,
        query="SELECT * FROM {}.{}".format(self.dataset_id, table_name),
        data=self.parse_expected_data(items))

    with beam.Pipeline(argv=self.args) as p:
      _ = (
          p
          | beam.Create(items)
          | beam.io.WriteToBigQuery(
              table=table_id,
              method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API,
              schema=schema,
              use_at_least_once=use_at_least_once,
              expansion_service=self.expansion_service))
    hamcrest_assert(p, bq_matcher)

  def test_all_types(self):
    table_name = "all_types"
    schema = self.ALL_TYPES_SCHEMA
    self.run_storage_write_test(table_name, self.ELEMENTS, schema)

  def test_with_at_least_once_semantics(self):
    table_name = "with_at_least_once_semantics"
    schema = self.ALL_TYPES_SCHEMA
    self.run_storage_write_test(
        table_name, self.ELEMENTS, schema, use_at_least_once=True)

  def test_nested_records_and_lists(self):
    table_name = "nested_records_and_lists"
    schema = {
        "fields": [{
            "name": "repeated_int", "type": "INTEGER", "mode": "REPEATED"
        },
        {
            "name": "struct",
            "type": "STRUCT",
            "fields": [{
                "name": "nested_int", "type": "INTEGER"
            }, {
                "name": "nested_str", "type": "STRING"
            }]
        },
        {
            "name": "repeated_struct",
            "type": "STRUCT",
            "mode": "REPEATED",
            "fields": [{
                "name": "nested_numeric", "type": "NUMERIC"
            }, {
                "name": "nested_bytes", "type": "BYTES"
            }]
        }]
    }
    items = [{
        "repeated_int": [1, 2, 3],
        "struct": {
            "nested_int": 1, "nested_str": "a"
        },
        "repeated_struct": [{
            "nested_numeric": Decimal("1.23"), "nested_bytes": b'a'
        }, {
            "nested_numeric": Decimal("3.21"), "nested_bytes": b'aa'
        }]
    }]

    self.run_storage_write_test(table_name, items, schema)

  def test_write_with_beam_rows(self):
    table = 'write_with_beam_rows'
    table_id = '{}:{}.{}'.format(self.project, self.dataset_id, table)

    row_elements = [
        beam.Row(
            my_int=e['int'],
            my_float=e['float'],
            my_numeric=e['numeric'],
            my_string=e['str'],
            my_bool=e['bool'],
            my_bytes=e['bytes'],
            my_timestamp=e['timestamp']) for e in self.ELEMENTS
    ]

    bq_matcher = BigqueryFullResultMatcher(
        project=self.project,
        query="SELECT * FROM {}.{}".format(self.dataset_id, table),
        data=self.parse_expected_data(self.ELEMENTS))

    with beam.Pipeline(argv=self.args) as p:
      _ = (
          p
          | beam.Create(row_elements)
          | StorageWriteToBigQuery(
              table=table_id, expansion_service=self.expansion_service))
    hamcrest_assert(p, bq_matcher)

  def run_streaming(
      self, table_name, auto_sharding=False, use_at_least_once=False):
    elements = self.ELEMENTS.copy()
    schema = self.ALL_TYPES_SCHEMA
    table_id = '{}:{}.{}'.format(self.project, self.dataset_id, table_name)

    bq_matcher = BigqueryFullResultStreamingMatcher(
        project=self.project,
        query="SELECT * FROM {}.{}".format(self.dataset_id, table_name),
        data=self.parse_expected_data(self.ELEMENTS))

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=bq_matcher,
        streaming=True,
        allow_unsafe_triggers=True)

    with beam.Pipeline(argv=args) as p:
      _ = (
          p
          | GenerateSequence(
              start=0, stop=4, expansion_service=self.expansion_service)
          | beam.Map(lambda x: elements[x])
          | beam.io.WriteToBigQuery(
              table=table_id,
              method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API,
              schema=schema,
              with_auto_sharding=auto_sharding,
              use_at_least_once=use_at_least_once,
              expansion_service=self.expansion_service))
    hamcrest_assert(p, bq_matcher)
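
  # Streaming variants: the same fixture rows are driven through the
  # cross-language GenerateSequence source with streaming=True, and results
  # are verified with BigqueryFullResultStreamingMatcher, which keeps polling
  # the table until the expected rows appear (or it times out).
  # with_auto_sharding and use_at_least_once are toggled per test below.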

  def test_streaming(self):
    table = 'streaming'
    self.run_streaming(table_name=table)

  def test_streaming_with_at_least_once(self):
    table = 'streaming'
    self.run_streaming(table_name=table, use_at_least_once=True)

  def test_streaming_with_auto_sharding(self):
    table = 'streaming_with_auto_sharding'
    self.run_streaming(table_name=table, auto_sharding=True)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()
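
# A minimal sketch of how this suite might be invoked locally, assuming a GCP
# Java expansion service is already listening on the chosen port and that
# application-default credentials with BigQuery access are configured. The
# port, project, bucket, and runner below are illustrative placeholders, not
# values taken from this file:
#
#   EXPANSION_PORT=8097 pytest -m uses_gcp_java_expansion_service \
#       apache_beam/io/external/xlang_bigqueryio_it_test.py \
#       --test-pipeline-options="--project=<your-project> \
#           --runner=TestDataflowRunner \
#           --temp_location=gs://<your-bucket>/tmp"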