github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_read_it_test.py (about) 1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 # 4 # Licensed to the Apache Software Foundation (ASF) under one or more 5 # contributor license agreements. See the NOTICE file distributed with 6 # this work for additional information regarding copyright ownership. 7 # The ASF licenses this file to You under the Apache License, Version 2.0 8 # (the "License"); you may not use this file except in compliance with 9 # the License. You may obtain a copy of the License at 10 # 11 # http://www.apache.org/licenses/LICENSE-2.0 12 # 13 # Unless required by applicable law or agreed to in writing, software 14 # distributed under the License is distributed on an "AS IS" BASIS, 15 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 # See the License for the specific language governing permissions and 17 # limitations under the License. 18 # 19 20 """Unit tests for BigQuery sources and sinks.""" 21 # pytype: skip-file 22 23 import base64 24 import datetime 25 import logging 26 import secrets 27 import time 28 import unittest 29 import uuid 30 from decimal import Decimal 31 from functools import wraps 32 33 import pytest 34 35 import apache_beam as beam 36 import apache_beam.io.gcp.bigquery 37 from apache_beam.io.gcp import bigquery_schema_tools 38 from apache_beam.io.gcp import bigquery_tools 39 from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper 40 from apache_beam.io.gcp.internal.clients import bigquery 41 from apache_beam.options.value_provider import StaticValueProvider 42 from apache_beam.runners.interactive import interactive_beam 43 from apache_beam.runners.interactive.interactive_runner import InteractiveRunner 44 from apache_beam.testing.test_pipeline import TestPipeline 45 from apache_beam.testing.util import assert_that 46 from apache_beam.testing.util import equal_to 47 from apache_beam.utils.timestamp import Timestamp 48 49 # Protect against environments where bigquery library is not available. 50 # pylint: disable=wrong-import-order, wrong-import-position 51 try: 52 from apitools.base.py.exceptions import HttpError 53 except ImportError: 54 HttpError = None 55 # pylint: enable=wrong-import-order, wrong-import-position 56 57 _LOGGER = logging.getLogger(__name__) 58 59 60 def skip(runners): 61 if not isinstance(runners, list): 62 runners = [runners] 63 64 def inner(fn): 65 @wraps(fn) 66 def wrapped(self): 67 if self.runner_name in runners: 68 self.skipTest( 69 'This test doesn\'t work on these runners: {}'.format(runners)) 70 else: 71 return fn(self) 72 73 return wrapped 74 75 return inner 76 77 78 def datetime_to_utc(element): 79 for k, v in element.items(): 80 if isinstance(v, (datetime.time, datetime.date)): 81 element[k] = str(v) 82 if isinstance(v, datetime.datetime) and v.tzinfo: 83 # For datetime objects, we'll 84 offset = v.utcoffset() 85 utc_dt = (v - offset).strftime('%Y-%m-%d %H:%M:%S.%f UTC') 86 element[k] = utc_dt 87 return element 88 89 90 class BigQueryReadIntegrationTests(unittest.TestCase): 91 BIG_QUERY_DATASET_ID = 'python_read_table_' 92 93 @classmethod 94 def setUpClass(cls): 95 cls.test_pipeline = TestPipeline(is_integration_test=True) 96 cls.args = cls.test_pipeline.get_full_options_as_args() 97 cls.runner_name = type(cls.test_pipeline.runner).__name__ 98 cls.project = cls.test_pipeline.get_option('project') 99 100 cls.bigquery_client = BigQueryWrapper() 101 cls.dataset_id = '%s%d%s' % ( 102 cls.BIG_QUERY_DATASET_ID, int(time.time()), secrets.token_hex(3)) 103 cls.bigquery_client.get_or_create_dataset(cls.project, cls.dataset_id) 104 _LOGGER.info( 105 "Created dataset %s in project %s", cls.dataset_id, cls.project) 106 107 @classmethod 108 def tearDownClass(cls): 109 request = bigquery.BigqueryDatasetsDeleteRequest( 110 projectId=cls.project, datasetId=cls.dataset_id, deleteContents=True) 111 try: 112 _LOGGER.info( 113 "Deleting dataset %s in project %s", cls.dataset_id, cls.project) 114 cls.bigquery_client.client.datasets.Delete(request) 115 except HttpError: 116 _LOGGER.debug( 117 'Failed to clean up dataset %s in project %s', 118 cls.dataset_id, 119 cls.project) 120 121 122 class ReadTests(BigQueryReadIntegrationTests): 123 TABLE_DATA = [{ 124 'number': 1, 'str': 'abc' 125 }, { 126 'number': 2, 'str': 'def' 127 }, { 128 'number': 3, 'str': u'你好' 129 }, { 130 'number': 4, 'str': u'привет' 131 }] 132 133 @classmethod 134 def setUpClass(cls): 135 super(ReadTests, cls).setUpClass() 136 cls.table_name = 'python_read_table' 137 cls.create_table(cls.table_name) 138 139 table_id = '{}.{}'.format(cls.dataset_id, cls.table_name) 140 cls.query = 'SELECT number, str FROM `%s`' % table_id 141 142 @classmethod 143 def create_table(cls, table_name): 144 table_schema = bigquery.TableSchema() 145 table_field = bigquery.TableFieldSchema() 146 table_field.name = 'number' 147 table_field.type = 'INTEGER' 148 table_schema.fields.append(table_field) 149 table_field = bigquery.TableFieldSchema() 150 table_field.name = 'str' 151 table_field.type = 'STRING' 152 table_schema.fields.append(table_field) 153 table = bigquery.Table( 154 tableReference=bigquery.TableReference( 155 projectId=cls.project, datasetId=cls.dataset_id, 156 tableId=table_name), 157 schema=table_schema) 158 request = bigquery.BigqueryTablesInsertRequest( 159 projectId=cls.project, datasetId=cls.dataset_id, table=table) 160 cls.bigquery_client.client.tables.Insert(request) 161 cls.bigquery_client.insert_rows( 162 cls.project, cls.dataset_id, table_name, cls.TABLE_DATA) 163 164 @skip(['PortableRunner', 'FlinkRunner']) 165 @pytest.mark.it_postcommit 166 def test_native_source(self): 167 with beam.Pipeline(argv=self.args) as p: 168 result = ( 169 p | 'read' >> beam.io.Read( 170 beam.io.BigQuerySource(query=self.query, use_standard_sql=True))) 171 assert_that(result, equal_to(self.TABLE_DATA)) 172 173 @pytest.mark.it_postcommit 174 def test_iobase_source(self): 175 query = StaticValueProvider(str, self.query) 176 with beam.Pipeline(argv=self.args) as p: 177 result = ( 178 p | 'read with value provider query' >> beam.io.ReadFromBigQuery( 179 query=query, use_standard_sql=True, project=self.project)) 180 assert_that(result, equal_to(self.TABLE_DATA)) 181 182 @pytest.mark.it_postcommit 183 def test_table_schema_retrieve(self): 184 the_table = bigquery_tools.BigQueryWrapper().get_table( 185 project_id="apache-beam-testing", 186 dataset_id="beam_bigquery_io_test", 187 table_id="table_schema_retrieve") 188 table = the_table.schema 189 utype = bigquery_schema_tools.\ 190 generate_user_type_from_bq_schema(table) 191 with beam.Pipeline(argv=self.args) as p: 192 result = ( 193 p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( 194 gcs_location="gs://bqio_schema_test", 195 dataset="beam_bigquery_io_test", 196 table="table_schema_retrieve", 197 project="apache-beam-testing", 198 output_type='BEAM_ROW')) 199 assert_that( 200 result, 201 equal_to([ 202 utype( 203 id=1, 204 name='customer1', 205 type='test', 206 times=Timestamp(1633262400)), 207 utype( 208 id=3, 209 name='customer1', 210 type='test', 211 times=Timestamp(1664798400)), 212 utype( 213 id=2, 214 name='customer2', 215 type='test', 216 times=Timestamp(1601726400)), 217 utype( 218 id=4, 219 name='customer2', 220 type='test', 221 times=Timestamp(1570104000)) 222 ])) 223 224 @pytest.mark.it_postcommit 225 def test_table_schema_retrieve_specifying_only_table(self): 226 the_table = bigquery_tools.BigQueryWrapper().get_table( 227 project_id="apache-beam-testing", 228 dataset_id="beam_bigquery_io_test", 229 table_id="table_schema_retrieve") 230 table = the_table.schema 231 utype = bigquery_schema_tools.\ 232 generate_user_type_from_bq_schema(table) 233 with beam.Pipeline(argv=self.args) as p: 234 result = ( 235 p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( 236 gcs_location="gs://bqio_schema_test", 237 table="apache-beam-testing:" 238 "beam_bigquery_io_test." 239 "table_schema_retrieve", 240 output_type='BEAM_ROW')) 241 assert_that( 242 result, 243 equal_to([ 244 utype( 245 id=1, 246 name='customer1', 247 type='test', 248 times=Timestamp(1633262400)), 249 utype( 250 id=3, 251 name='customer1', 252 type='test', 253 times=Timestamp(1664798400)), 254 utype( 255 id=2, 256 name='customer2', 257 type='test', 258 times=Timestamp(1601726400)), 259 utype( 260 id=4, 261 name='customer2', 262 type='test', 263 times=Timestamp(1570104000)) 264 ])) 265 266 @pytest.mark.it_postcommit 267 def test_table_schema_retrieve_with_direct_read(self): 268 the_table = bigquery_tools.BigQueryWrapper().get_table( 269 project_id="apache-beam-testing", 270 dataset_id="beam_bigquery_io_test", 271 table_id="table_schema_retrieve") 272 table = the_table.schema 273 utype = bigquery_schema_tools.\ 274 generate_user_type_from_bq_schema(table) 275 with beam.Pipeline(argv=self.args) as p: 276 result = ( 277 p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( 278 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 279 table="apache-beam-testing:" 280 "beam_bigquery_io_test." 281 "table_schema_retrieve", 282 output_type='BEAM_ROW')) 283 assert_that( 284 result, 285 equal_to([ 286 utype( 287 id=1, 288 name='customer1', 289 type='test', 290 times=Timestamp(1633262400)), 291 utype( 292 id=3, 293 name='customer1', 294 type='test', 295 times=Timestamp(1664798400)), 296 utype( 297 id=2, 298 name='customer2', 299 type='test', 300 times=Timestamp(1601726400)), 301 utype( 302 id=4, 303 name='customer2', 304 type='test', 305 times=Timestamp(1570104000)) 306 ])) 307 308 309 class ReadUsingStorageApiTests(BigQueryReadIntegrationTests): 310 TABLE_DATA = [{ 311 'number': 1, 312 'string': u'你好', 313 'time': '12:44:31', 314 'datetime': '2018-12-31 12:44:31', 315 'rec': None 316 }, 317 { 318 'number': 4, 319 'string': u'привет', 320 'time': '12:44:31', 321 'datetime': '2018-12-31 12:44:31', 322 'rec': { 323 'rec_datetime': '2018-12-31 12:44:31', 324 'rec_rec': { 325 'rec_rec_datetime': '2018-12-31 12:44:31' 326 } 327 }, 328 }] 329 330 @classmethod 331 def setUpClass(cls): 332 super(ReadUsingStorageApiTests, cls).setUpClass() 333 cls.table_name = 'python_read_table' 334 cls._create_table(cls.table_name) 335 336 table_id = '{}.{}'.format(cls.dataset_id, cls.table_name) 337 cls.query = 'SELECT * FROM `%s`' % table_id 338 339 # Materializing the newly created Table to ensure the Read API can stream. 340 cls.temp_table_reference = cls._execute_query(cls.project, cls.query) 341 342 @classmethod 343 def tearDownClass(cls): 344 cls.bigquery_client.clean_up_temporary_dataset(cls.project) 345 super(ReadUsingStorageApiTests, cls).tearDownClass() 346 347 @classmethod 348 def _create_table(cls, table_name): 349 table_schema = bigquery.TableSchema() 350 351 number = bigquery.TableFieldSchema() 352 number.name = 'number' 353 number.type = 'INTEGER' 354 table_schema.fields.append(number) 355 356 string = bigquery.TableFieldSchema() 357 string.name = 'string' 358 string.type = 'STRING' 359 table_schema.fields.append(string) 360 361 time = bigquery.TableFieldSchema() 362 time.name = 'time' 363 time.type = 'TIME' 364 table_schema.fields.append(time) 365 366 datetime = bigquery.TableFieldSchema() 367 datetime.name = 'datetime' 368 datetime.type = 'DATETIME' 369 table_schema.fields.append(datetime) 370 371 rec = bigquery.TableFieldSchema() 372 rec.name = 'rec' 373 rec.type = 'RECORD' 374 rec_datetime = bigquery.TableFieldSchema() 375 rec_datetime.name = 'rec_datetime' 376 rec_datetime.type = 'DATETIME' 377 rec.fields.append(rec_datetime) 378 rec_rec = bigquery.TableFieldSchema() 379 rec_rec.name = 'rec_rec' 380 rec_rec.type = 'RECORD' 381 rec_rec_datetime = bigquery.TableFieldSchema() 382 rec_rec_datetime.name = 'rec_rec_datetime' 383 rec_rec_datetime.type = 'DATETIME' 384 rec_rec.fields.append(rec_rec_datetime) 385 rec.fields.append(rec_rec) 386 table_schema.fields.append(rec) 387 388 table = bigquery.Table( 389 tableReference=bigquery.TableReference( 390 projectId=cls.project, datasetId=cls.dataset_id, 391 tableId=table_name), 392 schema=table_schema) 393 request = bigquery.BigqueryTablesInsertRequest( 394 projectId=cls.project, datasetId=cls.dataset_id, table=table) 395 cls.bigquery_client.client.tables.Insert(request) 396 cls.bigquery_client.insert_rows( 397 cls.project, cls.dataset_id, table_name, cls.TABLE_DATA) 398 399 @classmethod 400 def _setup_temporary_dataset(cls, project, query): 401 location = cls.bigquery_client.get_query_location(project, query, False) 402 cls.bigquery_client.create_temporary_dataset(project, location) 403 404 @classmethod 405 def _execute_query(cls, project, query): 406 query_job_name = bigquery_tools.generate_bq_job_name( 407 'materializing_table_before_reading', 408 str(uuid.uuid4())[0:10], 409 bigquery_tools.BigQueryJobTypes.QUERY, 410 '%d_%s' % (int(time.time()), secrets.token_hex(3))) 411 cls._setup_temporary_dataset(cls.project, cls.query) 412 job = cls.bigquery_client._start_query_job( 413 project, 414 query, 415 use_legacy_sql=False, 416 flatten_results=False, 417 job_id=query_job_name, 418 priority=beam.io.BigQueryQueryPriority.BATCH) 419 job_ref = job.jobReference 420 cls.bigquery_client.wait_for_bq_job(job_ref, max_retries=0) 421 return cls.bigquery_client._get_temp_table(project) 422 423 @pytest.mark.it_postcommit 424 def test_iobase_source(self): 425 EXPECTED_TABLE_DATA = [ 426 { 427 'number': 1, 428 'string': u'你好', 429 'time': datetime.time(12, 44, 31), 430 'datetime': '2018-12-31T12:44:31', 431 'rec': None, 432 }, 433 { 434 'number': 4, 435 'string': u'привет', 436 'time': datetime.time(12, 44, 31), 437 'datetime': '2018-12-31T12:44:31', 438 'rec': { 439 'rec_datetime': '2018-12-31T12:44:31', 440 'rec_rec': { 441 'rec_rec_datetime': '2018-12-31T12:44:31', 442 } 443 }, 444 } 445 ] 446 with beam.Pipeline(argv=self.args) as p: 447 result = ( 448 p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery( 449 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 450 table=self.temp_table_reference)) 451 assert_that(result, equal_to(EXPECTED_TABLE_DATA)) 452 453 @pytest.mark.it_postcommit 454 def test_iobase_source_with_native_datetime(self): 455 EXPECTED_TABLE_DATA = [ 456 { 457 'number': 1, 458 'string': u'你好', 459 'time': datetime.time(12, 44, 31), 460 'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31), 461 'rec': None, 462 }, 463 { 464 'number': 4, 465 'string': u'привет', 466 'time': datetime.time(12, 44, 31), 467 'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31), 468 'rec': { 469 'rec_datetime': datetime.datetime(2018, 12, 31, 12, 44, 31), 470 'rec_rec': { 471 'rec_rec_datetime': datetime.datetime( 472 2018, 12, 31, 12, 44, 31) 473 } 474 }, 475 } 476 ] 477 with beam.Pipeline(argv=self.args) as p: 478 result = ( 479 p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery( 480 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 481 table=self.temp_table_reference, 482 use_native_datetime=True)) 483 assert_that(result, equal_to(EXPECTED_TABLE_DATA)) 484 485 @pytest.mark.it_postcommit 486 def test_iobase_source_with_column_selection(self): 487 EXPECTED_TABLE_DATA = [{'number': 1}, {'number': 4}] 488 with beam.Pipeline(argv=self.args) as p: 489 result = ( 490 p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery( 491 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 492 table=self.temp_table_reference, 493 selected_fields=['number'])) 494 assert_that(result, equal_to(EXPECTED_TABLE_DATA)) 495 496 @pytest.mark.it_postcommit 497 def test_iobase_source_with_row_restriction(self): 498 EXPECTED_TABLE_DATA = [{ 499 'number': 1, 500 'string': u'你好', 501 'time': datetime.time(12, 44, 31), 502 'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31), 503 'rec': None 504 }] 505 with beam.Pipeline(argv=self.args) as p: 506 result = ( 507 p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery( 508 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 509 table=self.temp_table_reference, 510 row_restriction='number < 2', 511 use_native_datetime=True)) 512 assert_that(result, equal_to(EXPECTED_TABLE_DATA)) 513 514 @pytest.mark.it_postcommit 515 def test_iobase_source_with_column_selection_and_row_restriction(self): 516 EXPECTED_TABLE_DATA = [{'string': u'привет'}] 517 with beam.Pipeline(argv=self.args) as p: 518 result = ( 519 p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery( 520 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 521 table=self.temp_table_reference, 522 row_restriction='number > 2', 523 selected_fields=['string'])) 524 assert_that(result, equal_to(EXPECTED_TABLE_DATA)) 525 526 @pytest.mark.it_postcommit 527 def test_iobase_source_with_very_selective_filters(self): 528 with beam.Pipeline(argv=self.args) as p: 529 result = ( 530 p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery( 531 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 532 project=self.temp_table_reference.projectId, 533 dataset=self.temp_table_reference.datasetId, 534 table=self.temp_table_reference.tableId, 535 row_restriction='number > 4', 536 selected_fields=['string'])) 537 assert_that(result, equal_to([])) 538 539 @pytest.mark.it_postcommit 540 def test_iobase_source_with_query(self): 541 EXPECTED_TABLE_DATA = [ 542 { 543 'number': 1, 544 'string': u'你好', 545 'time': datetime.time(12, 44, 31), 546 'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31), 547 'rec': None, 548 }, 549 { 550 'number': 4, 551 'string': u'привет', 552 'time': datetime.time(12, 44, 31), 553 'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31), 554 'rec': { 555 'rec_datetime': datetime.datetime(2018, 12, 31, 12, 44, 31), 556 'rec_rec': { 557 'rec_rec_datetime': datetime.datetime( 558 2018, 12, 31, 12, 44, 31) 559 } 560 }, 561 } 562 ] 563 query = StaticValueProvider(str, self.query) 564 with beam.Pipeline(argv=self.args) as p: 565 result = ( 566 p | 'Direct read with query' >> beam.io.ReadFromBigQuery( 567 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 568 use_native_datetime=True, 569 use_standard_sql=True, 570 project=self.project, 571 query=query)) 572 assert_that(result, equal_to(EXPECTED_TABLE_DATA)) 573 574 @pytest.mark.it_postcommit 575 def test_iobase_source_with_query_and_filters(self): 576 EXPECTED_TABLE_DATA = [{'string': u'привет'}] 577 query = StaticValueProvider(str, self.query) 578 with beam.Pipeline(argv=self.args) as p: 579 result = ( 580 p | 'Direct read with query' >> beam.io.ReadFromBigQuery( 581 method=beam.io.ReadFromBigQuery.Method.DIRECT_READ, 582 row_restriction='number > 2', 583 selected_fields=['string'], 584 use_standard_sql=True, 585 project=self.project, 586 query=query)) 587 assert_that(result, equal_to(EXPECTED_TABLE_DATA)) 588 589 590 class ReadNewTypesTests(BigQueryReadIntegrationTests): 591 @classmethod 592 def setUpClass(cls): 593 super(ReadNewTypesTests, cls).setUpClass() 594 cls.table_name = 'python_new_types' 595 cls.create_table(cls.table_name) 596 597 table_id = '{}.{}'.format(cls.dataset_id, cls.table_name) 598 cls.query = 'SELECT float, numeric, bytes, date, time, datetime,' \ 599 'timestamp, geo FROM `%s`' % table_id 600 601 @classmethod 602 def create_table(cls, table_name): 603 table_schema = bigquery.TableSchema() 604 table_field = bigquery.TableFieldSchema() 605 table_field.name = 'float' 606 table_field.type = 'FLOAT' 607 table_schema.fields.append(table_field) 608 table_field = bigquery.TableFieldSchema() 609 table_field.name = 'numeric' 610 table_field.type = 'NUMERIC' 611 table_schema.fields.append(table_field) 612 table_field = bigquery.TableFieldSchema() 613 table_field.name = 'bytes' 614 table_field.type = 'BYTES' 615 table_schema.fields.append(table_field) 616 table_field = bigquery.TableFieldSchema() 617 table_field.name = 'date' 618 table_field.type = 'DATE' 619 table_schema.fields.append(table_field) 620 table_field = bigquery.TableFieldSchema() 621 table_field.name = 'time' 622 table_field.type = 'TIME' 623 table_schema.fields.append(table_field) 624 table_field = bigquery.TableFieldSchema() 625 table_field.name = 'datetime' 626 table_field.type = 'DATETIME' 627 table_schema.fields.append(table_field) 628 table_field = bigquery.TableFieldSchema() 629 table_field.name = 'timestamp' 630 table_field.type = 'TIMESTAMP' 631 table_schema.fields.append(table_field) 632 table_field = bigquery.TableFieldSchema() 633 table_field.name = 'geo' 634 table_field.type = 'GEOGRAPHY' 635 table_schema.fields.append(table_field) 636 table = bigquery.Table( 637 tableReference=bigquery.TableReference( 638 projectId=cls.project, datasetId=cls.dataset_id, 639 tableId=table_name), 640 schema=table_schema) 641 request = bigquery.BigqueryTablesInsertRequest( 642 projectId=cls.project, datasetId=cls.dataset_id, table=table) 643 cls.bigquery_client.client.tables.Insert(request) 644 row_data = { 645 'float': 0.33, 646 'numeric': Decimal('10'), 647 'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'), 648 'date': '3000-12-31', 649 'time': '23:59:59', 650 'datetime': '2018-12-31T12:44:31', 651 'timestamp': '2018-12-31 12:44:31.744957 UTC', 652 'geo': 'POINT(30 10)' 653 } 654 655 table_data = [row_data] 656 # add rows with only one key value pair and None values for all other keys 657 for key, value in row_data.items(): 658 table_data.append({key: value}) 659 660 cls.bigquery_client.insert_rows( 661 cls.project, cls.dataset_id, table_name, table_data) 662 663 def get_expected_data(self, native=True): 664 byts = b'\xab\xac' 665 expected_row = { 666 'float': 0.33, 667 'numeric': Decimal('10'), 668 'bytes': base64.b64encode(byts) if native else byts, 669 'date': '3000-12-31', 670 'time': '23:59:59', 671 'datetime': '2018-12-31T12:44:31', 672 'timestamp': '2018-12-31 12:44:31.744957 UTC', 673 'geo': 'POINT(30 10)' 674 } 675 676 expected_data = [expected_row] 677 678 # add rows with only one key value pair and None values for all other keys 679 for key, value in expected_row.items(): 680 row = {k: None for k in expected_row} 681 row[key] = value 682 expected_data.append(row) 683 684 return expected_data 685 686 @skip(['PortableRunner', 'FlinkRunner']) 687 @pytest.mark.it_postcommit 688 def test_native_source(self): 689 with beam.Pipeline(argv=self.args) as p: 690 result = ( 691 p 692 | 'read' >> beam.io.Read( 693 beam.io.BigQuerySource(query=self.query, use_standard_sql=True))) 694 assert_that(result, equal_to(self.get_expected_data())) 695 696 @pytest.mark.it_postcommit 697 def test_iobase_source(self): 698 with beam.Pipeline(argv=self.args) as p: 699 result = ( 700 p 701 | 'read' >> beam.io.ReadFromBigQuery( 702 query=self.query, 703 use_standard_sql=True, 704 project=self.project, 705 bigquery_job_labels={'launcher': 'apache_beam_tests'}) 706 | beam.Map(datetime_to_utc)) 707 assert_that(result, equal_to(self.get_expected_data(native=False))) 708 709 710 class ReadAllBQTests(BigQueryReadIntegrationTests): 711 TABLE_DATA_1 = [{ 712 'number': 1, 'str': 'abc' 713 }, { 714 'number': 2, 'str': 'def' 715 }, { 716 'number': 3, 'str': u'你好' 717 }, { 718 'number': 4, 'str': u'привет' 719 }] 720 721 TABLE_DATA_2 = [{ 722 'number': 10, 'str': 'abcd' 723 }, { 724 'number': 20, 'str': 'defg' 725 }, { 726 'number': 30, 'str': u'你好' 727 }, { 728 'number': 40, 'str': u'привет' 729 }] 730 731 TABLE_DATA_3 = [{'number': 10, 'str': 'abcde', 'extra': 3}] 732 733 @classmethod 734 def setUpClass(cls): 735 super(ReadAllBQTests, cls).setUpClass() 736 cls.SCHEMA_BQ = cls.create_bq_schema() 737 cls.SCHEMA_BQ_WITH_EXTRA = cls.create_bq_schema(True) 738 739 cls.table_name1 = 'python_rd_table_1' 740 cls.table_schema1 = cls.create_table( 741 cls.table_name1, cls.TABLE_DATA_1, cls.SCHEMA_BQ) 742 table_id1 = '{}.{}'.format(cls.dataset_id, cls.table_name1) 743 cls.query1 = 'SELECT number, str FROM `%s`' % table_id1 744 745 cls.table_name2 = 'python_rd_table_2' 746 cls.table_schema2 = cls.create_table( 747 cls.table_name2, cls.TABLE_DATA_2, cls.SCHEMA_BQ) 748 table_id2 = '{}.{}'.format(cls.dataset_id, cls.table_name2) 749 cls.query2 = 'SELECT number, str FROM %s' % table_id2 750 751 cls.table_name3 = 'python_rd_table_3' 752 cls.table_schema3 = cls.create_table( 753 cls.table_name3, cls.TABLE_DATA_3, cls.SCHEMA_BQ_WITH_EXTRA) 754 table_id3 = '{}.{}'.format(cls.dataset_id, cls.table_name3) 755 cls.query3 = 'SELECT number, str, extra FROM `%s`' % table_id3 756 757 @classmethod 758 def create_table(cls, table_name, data, table_schema): 759 table = bigquery.Table( 760 tableReference=bigquery.TableReference( 761 projectId=cls.project, datasetId=cls.dataset_id, 762 tableId=table_name), 763 schema=table_schema) 764 request = bigquery.BigqueryTablesInsertRequest( 765 projectId=cls.project, datasetId=cls.dataset_id, table=table) 766 cls.bigquery_client.client.tables.Insert(request) 767 cls.bigquery_client.insert_rows( 768 cls.project, cls.dataset_id, table_name, data) 769 return table_schema 770 771 @classmethod 772 def create_bq_schema(cls, with_extra=False): 773 table_schema = bigquery.TableSchema() 774 table_field = bigquery.TableFieldSchema() 775 table_field.name = 'number' 776 table_field.type = 'INTEGER' 777 table_field.mode = 'NULLABLE' 778 table_schema.fields.append(table_field) 779 table_field = bigquery.TableFieldSchema() 780 table_field.name = 'str' 781 table_field.type = 'STRING' 782 table_field.mode = 'NULLABLE' 783 table_schema.fields.append(table_field) 784 if with_extra: 785 table_field = bigquery.TableFieldSchema() 786 table_field.name = 'extra' 787 table_field.type = 'INTEGER' 788 table_field.mode = 'NULLABLE' 789 table_schema.fields.append(table_field) 790 return table_schema 791 792 @skip(['PortableRunner', 'FlinkRunner']) 793 @pytest.mark.it_postcommit 794 def test_read_queries(self): 795 # TODO(https://github.com/apache/beam/issues/20610): Remove experiment when 796 # tests run on r_v2. 797 args = self.args + ["--experiments=use_runner_v2"] 798 with beam.Pipeline(argv=args) as p: 799 result = ( 800 p 801 | beam.Create([ 802 beam.io.ReadFromBigQueryRequest(query=self.query1), 803 beam.io.ReadFromBigQueryRequest( 804 query=self.query2, use_standard_sql=False), 805 beam.io.ReadFromBigQueryRequest( 806 table='%s.%s' % (self.dataset_id, self.table_name3)) 807 ]) 808 | beam.io.ReadAllFromBigQuery()) 809 assert_that( 810 result, 811 equal_to(self.TABLE_DATA_1 + self.TABLE_DATA_2 + self.TABLE_DATA_3)) 812 813 814 class ReadInteractiveRunnerTests(BigQueryReadIntegrationTests): 815 @skip(['PortableRunner', 'FlinkRunner']) 816 @pytest.mark.it_postcommit 817 def test_read_in_interactive_runner(self): 818 p = beam.Pipeline(InteractiveRunner(), argv=self.args) 819 pcoll = p | beam.io.ReadFromBigQuery(query="SELECT 1") 820 result = interactive_beam.collect(pcoll) 821 assert result.iloc[0, 0] == 1 822 823 824 if __name__ == '__main__': 825 logging.getLogger().setLevel(logging.INFO) 826 unittest.main()