github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_write_it_test.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for the BigQuery sink (WriteToBigQuery)."""
# pytype: skip-file

import base64
import datetime
import logging
import secrets
import time
import unittest
from decimal import Decimal

import hamcrest as hc
import mock
import pytest
import pytz
from parameterized import param
from parameterized import parameterized

import apache_beam as beam
from apache_beam.io.gcp.bigquery import BigQueryWriteFn
from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
from apache_beam.io.gcp.bigquery_tools import FileFormat
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

# Protect against environments where the bigquery library is not available.
# pylint: disable=wrong-import-order, wrong-import-position

try:
  from apitools.base.py.exceptions import HttpError
except ImportError:
  HttpError = None
# pylint: enable=wrong-import-order, wrong-import-position

_LOGGER = logging.getLogger(__name__)


class BigQueryWriteIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_write_to_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%d%s' % (
        self.BIG_QUERY_DATASET_ID, int(time.time()), secrets.token_hex(3))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    _LOGGER.info(
        "Created dataset %s in project %s", self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id, deleteContents=True)
    try:
      _LOGGER.info(
          "Deleting dataset %s in project %s", self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      _LOGGER.debug(
          'Failed to clean up dataset %s in project %s',
          self.dataset_id,
          self.project)

  def create_table(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'int64'
    table_field.type = 'INT64'
    table_field.mode = 'REQUIRED'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)

  @pytest.mark.it_postcommit
  def test_big_query_write(self):
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'},
    ]
    table_schema = {
        "fields": [
            {"name": "number", "type": "INTEGER"},
            {"name": "str", "type": "STRING"},
        ]
    }

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[
                (1, 'abc'),
                (2, 'def'),
                (3, u'你好'),
                (4, u'привет'),
            ])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
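
  # A note on the shared verification pattern (a descriptive summary of the
  # code above, not additional behavior): each test builds its pipeline
  # options with an on_success_matcher, so after a successful run the
  # attached BigqueryFullResultMatcher queries the dataset created in setUp
  # and compares the returned rows against the expected tuples.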

  @pytest.mark.it_postcommit
  def test_big_query_write_schema_autodetect(self):
    if self.runner_name == 'TestDataflowRunner':
      self.skipTest('DataflowRunner does not support schema autodetection')

    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[
                (1, 'abc'),
                (2, 'def'),
            ])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
              schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
              temp_file_format=FileFormat.JSON))

  @pytest.mark.it_postcommit
  def test_big_query_write_new_types(self):
    table_name = 'python_new_types_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    row_data = {
        'float': 0.33,
        'numeric': Decimal('10'),
        'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
        'date': '3000-12-31',
        'time': '23:59:59',
        'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC',
        'geo': 'POINT(30 10)'
    }

    input_data = [row_data]
    # add rows with only one key value pair and None values for all other keys
    for key, value in row_data.items():
      input_data.append({key: value})

    table_schema = {
        "fields": [
            {"name": "float", "type": "FLOAT"},
            {"name": "numeric", "type": "NUMERIC"},
            {"name": "bytes", "type": "BYTES"},
            {"name": "date", "type": "DATE"},
            {"name": "time", "type": "TIME"},
            {"name": "datetime", "type": "DATETIME"},
            {"name": "timestamp", "type": "TIMESTAMP"},
            {"name": "geo", "type": "GEOGRAPHY"},
        ]
    }

    expected_row = (
        0.33,
        Decimal('10'),
        b'\xab\xac',
        datetime.date(3000, 12, 31),
        datetime.time(23, 59, 59),
        datetime.datetime(2018, 12, 31, 12, 44, 31),
        datetime.datetime(2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc),
        'POINT(30 10)',
    )

    expected_data = [expected_row]

    # add rows with only one key value pair and None values for all other keys
    for i, value in enumerate(expected_row):
      row = [None] * len(expected_row)
      row[i] = value
      expected_data.append(tuple(row))

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query='SELECT float, numeric, bytes, date, time, datetime,'
            'timestamp, geo FROM %s' % table_id,
            data=expected_data)
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
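
  # A readability note on the conversions exercised by
  # test_big_query_write_new_types above: rows are supplied as JSON-friendly
  # values (base64 strings for BYTES, ISO-formatted strings for
  # DATE/TIME/DATETIME/TIMESTAMP, WKT text for GEOGRAPHY), while the result
  # matcher reads them back as native Python values (bytes, datetime.date,
  # datetime.time, a naive datetime.datetime for DATETIME, a UTC-aware
  # datetime.datetime for TIMESTAMP, and Decimal for NUMERIC).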

  @pytest.mark.it_postcommit
  def test_big_query_write_without_schema(self):
    table_name = 'python_no_schema_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {
            'int64': 1,
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        },
        {
            'int64': 2,
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        },
        {
            'int64': 3,
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        },
        {
            'int64': 4,
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        },
    ]
    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT int64, bytes, date, time FROM %s" % table_id,
            data=[
                (
                    1,
                    b'xyw',
                    datetime.date(2011, 1, 1),
                    datetime.time(23, 59, 59, 999999),
                ),
                (
                    2,
                    b'abc',
                    datetime.date(2000, 1, 1),
                    datetime.time(0, 0, 0),
                ),
                (
                    3,
                    b'\xe4\xbd\xa0\xe5\xa5\xbd',
                    datetime.date(3000, 12, 31),
                    datetime.time(23, 59, 59),
                ),
                (
                    4,
                    b'\xab\xac\xad',
                    datetime.date(2000, 1, 1),
                    datetime.time(0, 0, 0),
                ),
            ])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
              temp_file_format=FileFormat.JSON))

  @pytest.mark.it_postcommit
  def test_big_query_write_insert_errors_reporting(self):
    """
    Test that errors returned by beam.io.WriteToBigQuery contain both the
    failed rows and the reason for the failure.
    """
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {
            'number': 1,
            'str': 'some_string',
        },
        {
            'number': 2
        },
        {
            'number': 3,
            'str': 'some_string',
            'additional_field_str': 'some_string',
        },
    ]

    table_schema = {
        "fields": [
            {"name": "number", "type": "INTEGER", "mode": "REQUIRED"},
            {"name": "str", "type": "STRING", "mode": "REQUIRED"},
        ]
    }

    bq_result_errors = [
        (
            {"number": 2},
            [{
                "reason": "invalid",
                "location": "",
                "debugInfo": "",
                "message": "Missing required field: Msg_0_CLOUD_QUERY_TABLE.str."
            }],
        ),
        (
            {
                "number": 3,
                "str": "some_string",
                "additional_field_str": "some_string"
            },
            [{
                "reason": "invalid",
                "location": "additional_field_str",
                "debugInfo": "",
                "message": "no such field: additional_field_str."
            }],
        ),
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'some_string')]),
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      errors = (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              method='STREAMING_INSERTS',
              insert_retry_strategy='RETRY_NEVER',
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

      assert_that(
          errors[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS]
          | 'ParseErrors' >> beam.Map(lambda err: (err[1], err[2])),
          equal_to(bq_result_errors))
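
  # As the ParseErrors step above assumes, each element of the
  # FAILED_ROWS_WITH_ERRORS output is a tuple whose second and third entries
  # are the failed row dict and the list of error dicts returned by BigQuery
  # (the first entry identifies the destination table).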

  @pytest.mark.it_postcommit
  @parameterized.expand([
      param(file_format=FileFormat.AVRO),
      param(file_format=FileFormat.JSON),
      param(file_format=None),
  ])
  @mock.patch(
      "apache_beam.io.gcp.bigquery_file_loads._MAXIMUM_SOURCE_URIS", new=1)
  def test_big_query_write_temp_table_append_schema_update(self, file_format):
    """
    Test that schema update options (field addition) and schema relaxation,
    including on nested fields, are respected when appending to an existing
    table via temporary tables.

    _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
    load jobs and usage of temporary tables.
    """
    table_name = 'python_append_schema_update'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    # bytes, date, time fields are optional and omitted in the test;
    # only required and new columns are specified
    table_schema = {
        "fields": [
            {
                "name": "int64",
                "type": "INT64",
                "mode": "NULLABLE",
            },
            {
                "name": "bool",
                "type": "BOOL",
            },
            {
                "name": "nested_field",
                "type": "RECORD",
                "mode": "REPEATED",
                "fields": [
                    {
                        "name": "fruit",
                        "type": "STRING",
                        "mode": "NULLABLE"
                    },
                ]
            },
        ]
    }
    input_data = [
        {
            "int64": 1,
            "bool": True,
            "nested_field": [{"fruit": "Apple"}]
        },
        {
            "bool": False,
            "nested_field": [{"fruit": "Mango"}]
        },
        {
            "int64": None,
            "bool": True,
            "nested_field": [{"fruit": "Banana"}]
        },
    ]
    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=BigqueryFullResultMatcher(
            project=self.project,
            query="""
            SELECT bytes, date, time, int64, bool, fruit
            FROM {},
            UNNEST(nested_field) as nested_field
            ORDER BY fruit
            """.format(table_id),
            data=[
                (None, None, None, 1, True, "Apple"),
                (None, None, None, None, True, "Banana"),
                (None, None, None, None, False, "Mango"),
            ]))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
              max_file_size=1,  # bytes
              method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
              additional_bq_parameters={
                  'schemaUpdateOptions': [
                      'ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'
                  ]
              },
              temp_file_format=file_format))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()