github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_read_it_test.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for reading from BigQuery."""
# pytype: skip-file

import base64
import datetime
import logging
import secrets
import time
import unittest
import uuid
from decimal import Decimal
from functools import wraps

import pytest

import apache_beam as beam
import apache_beam.io.gcp.bigquery
from apache_beam.io.gcp import bigquery_schema_tools
from apache_beam.io.gcp import bigquery_tools
from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.options.value_provider import StaticValueProvider
from apache_beam.runners.interactive import interactive_beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.utils.timestamp import Timestamp

# Protect against environments where the bigquery library is not available.
# pylint: disable=wrong-import-order, wrong-import-position
try:
  from apitools.base.py.exceptions import HttpError
except ImportError:
  HttpError = None
# pylint: enable=wrong-import-order, wrong-import-position

_LOGGER = logging.getLogger(__name__)


def skip(runners):
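  """Returns a decorator that skips a test on the given runner(s).

  `runners` is a runner class name or a list of them, matched against the
  test class's runner_name attribute, e.g.:

    @skip(['PortableRunner', 'FlinkRunner'])
    def test_native_source(self):
      ...
  """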
  if not isinstance(runners, list):
    runners = [runners]

  def inner(fn):
    @wraps(fn)
    def wrapped(self):
      if self.runner_name in runners:
        self.skipTest(
            'This test doesn\'t work on these runners: {}'.format(runners))
      else:
        return fn(self)

    return wrapped

  return inner


def datetime_to_utc(element):
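  """Normalizes date/time values in a result row for comparison as strings.

  date, time and naive datetime values are passed through str(); timezone-
  aware datetimes are shifted to UTC and formatted like
  '2018-12-31 12:44:31.000000 UTC'.
  """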
  for k, v in element.items():
    if isinstance(v, (datetime.time, datetime.date)):
      element[k] = str(v)
    if isinstance(v, datetime.datetime) and v.tzinfo:
      # Shift timezone-aware datetimes to UTC and render them as strings.
      offset = v.utcoffset()
      utc_dt = (v - offset).strftime('%Y-%m-%d %H:%M:%S.%f UTC')
      element[k] = utc_dt
  return element


class BigQueryReadIntegrationTests(unittest.TestCase):
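  """Base class for the BigQuery read integration tests.

  setUpClass creates a uniquely named dataset ('python_read_table_' plus
  epoch seconds and a random hex suffix) in the test project; tearDownClass
  deletes it together with its contents.
  """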
  BIG_QUERY_DATASET_ID = 'python_read_table_'

  @classmethod
  def setUpClass(cls):
    cls.test_pipeline = TestPipeline(is_integration_test=True)
    cls.args = cls.test_pipeline.get_full_options_as_args()
    cls.runner_name = type(cls.test_pipeline.runner).__name__
    cls.project = cls.test_pipeline.get_option('project')

    cls.bigquery_client = BigQueryWrapper()
    cls.dataset_id = '%s%d%s' % (
        cls.BIG_QUERY_DATASET_ID, int(time.time()), secrets.token_hex(3))
    cls.bigquery_client.get_or_create_dataset(cls.project, cls.dataset_id)
    _LOGGER.info(
        "Created dataset %s in project %s", cls.dataset_id, cls.project)

  @classmethod
  def tearDownClass(cls):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=cls.project, datasetId=cls.dataset_id, deleteContents=True)
    try:
      _LOGGER.info(
          "Deleting dataset %s in project %s", cls.dataset_id, cls.project)
      cls.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      _LOGGER.debug(
          'Failed to clean up dataset %s in project %s',
          cls.dataset_id,
          cls.project)


class ReadTests(BigQueryReadIntegrationTests):
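  """Covers table and query reads, plus BEAM_ROW output with a user type
  derived from the schema of a shared table in apache-beam-testing."""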
  TABLE_DATA = [{
      'number': 1, 'str': 'abc'
  }, {
      'number': 2, 'str': 'def'
  }, {
      'number': 3, 'str': u'你好'
  }, {
      'number': 4, 'str': u'привет'
  }]

  @classmethod
  def setUpClass(cls):
    super().setUpClass()
    cls.table_name = 'python_read_table'
    cls.create_table(cls.table_name)

    table_id = '{}.{}'.format(cls.dataset_id, cls.table_name)
    cls.query = 'SELECT number, str FROM `%s`' % table_id

  @classmethod
  def create_table(cls, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'number'
    table_field.type = 'INTEGER'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'str'
    table_field.type = 'STRING'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=cls.project, datasetId=cls.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=cls.project, datasetId=cls.dataset_id, table=table)
    cls.bigquery_client.client.tables.Insert(request)
    cls.bigquery_client.insert_rows(
        cls.project, cls.dataset_id, table_name, cls.TABLE_DATA)

  @skip(['PortableRunner', 'FlinkRunner'])
  @pytest.mark.it_postcommit
  def test_native_source(self):
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'read' >> beam.io.Read(
              beam.io.BigQuerySource(query=self.query, use_standard_sql=True)))
      assert_that(result, equal_to(self.TABLE_DATA))

  @pytest.mark.it_postcommit
  def test_iobase_source(self):
    query = StaticValueProvider(str, self.query)
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'read with value provider query' >> beam.io.ReadFromBigQuery(
              query=query, use_standard_sql=True, project=self.project))
      assert_that(result, equal_to(self.TABLE_DATA))

  @pytest.mark.it_postcommit
  def test_table_schema_retrieve(self):
    the_table = bigquery_tools.BigQueryWrapper().get_table(
        project_id="apache-beam-testing",
        dataset_id="beam_bigquery_io_test",
        table_id="table_schema_retrieve")
    table = the_table.schema
    utype = bigquery_schema_tools.generate_user_type_from_bq_schema(table)
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | apache_beam.io.gcp.bigquery.ReadFromBigQuery(
              gcs_location="gs://bqio_schema_test",
              dataset="beam_bigquery_io_test",
              table="table_schema_retrieve",
              project="apache-beam-testing",
              output_type='BEAM_ROW'))
      assert_that(
          result,
          equal_to([
              utype(
                  id=1,
                  name='customer1',
                  type='test',
                  times=Timestamp(1633262400)),
              utype(
                  id=3,
                  name='customer1',
                  type='test',
                  times=Timestamp(1664798400)),
              utype(
                  id=2,
                  name='customer2',
                  type='test',
                  times=Timestamp(1601726400)),
              utype(
                  id=4,
                  name='customer2',
                  type='test',
                  times=Timestamp(1570104000))
          ]))

  @pytest.mark.it_postcommit
  def test_table_schema_retrieve_specifying_only_table(self):
    the_table = bigquery_tools.BigQueryWrapper().get_table(
        project_id="apache-beam-testing",
        dataset_id="beam_bigquery_io_test",
        table_id="table_schema_retrieve")
    table = the_table.schema
    utype = bigquery_schema_tools.generate_user_type_from_bq_schema(table)
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | apache_beam.io.gcp.bigquery.ReadFromBigQuery(
              gcs_location="gs://bqio_schema_test",
              table="apache-beam-testing:"
              "beam_bigquery_io_test."
              "table_schema_retrieve",
              output_type='BEAM_ROW'))
      assert_that(
          result,
          equal_to([
              utype(
                  id=1,
                  name='customer1',
                  type='test',
                  times=Timestamp(1633262400)),
              utype(
                  id=3,
                  name='customer1',
                  type='test',
                  times=Timestamp(1664798400)),
              utype(
                  id=2,
                  name='customer2',
                  type='test',
                  times=Timestamp(1601726400)),
              utype(
                  id=4,
                  name='customer2',
                  type='test',
                  times=Timestamp(1570104000))
          ]))

  @pytest.mark.it_postcommit
  def test_table_schema_retrieve_with_direct_read(self):
    the_table = bigquery_tools.BigQueryWrapper().get_table(
        project_id="apache-beam-testing",
        dataset_id="beam_bigquery_io_test",
        table_id="table_schema_retrieve")
    table = the_table.schema
    utype = bigquery_schema_tools.generate_user_type_from_bq_schema(table)
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | apache_beam.io.gcp.bigquery.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              table="apache-beam-testing:"
              "beam_bigquery_io_test."
              "table_schema_retrieve",
              output_type='BEAM_ROW'))
      assert_that(
          result,
          equal_to([
              utype(
                  id=1,
                  name='customer1',
                  type='test',
                  times=Timestamp(1633262400)),
              utype(
                  id=3,
                  name='customer1',
                  type='test',
                  times=Timestamp(1664798400)),
              utype(
                  id=2,
                  name='customer2',
                  type='test',
                  times=Timestamp(1601726400)),
              utype(
                  id=4,
                  name='customer2',
                  type='test',
                  times=Timestamp(1570104000))
          ]))


class ReadUsingStorageApiTests(BigQueryReadIntegrationTests):
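  """Tests for DIRECT_READ, i.e. the BigQuery Storage Read API.

  setUpClass materializes the test table through a query job so the Storage
  API can stream the resulting temp table; the tests cover column selection,
  row restrictions, native datetime handling, and query-based reads.
  """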
  TABLE_DATA = [
      {
          'number': 1,
          'string': u'你好',
          'time': '12:44:31',
          'datetime': '2018-12-31 12:44:31',
          'rec': None
      },
      {
          'number': 4,
          'string': u'привет',
          'time': '12:44:31',
          'datetime': '2018-12-31 12:44:31',
          'rec': {
              'rec_datetime': '2018-12-31 12:44:31',
              'rec_rec': {
                  'rec_rec_datetime': '2018-12-31 12:44:31'
              }
          },
      },
  ]

  @classmethod
  def setUpClass(cls):
    super().setUpClass()
    cls.table_name = 'python_read_table'
    cls._create_table(cls.table_name)

    table_id = '{}.{}'.format(cls.dataset_id, cls.table_name)
    cls.query = 'SELECT * FROM `%s`' % table_id

    # Materialize the newly created table so the Read API can stream it.
    cls.temp_table_reference = cls._execute_query(cls.project, cls.query)

  @classmethod
  def tearDownClass(cls):
    cls.bigquery_client.clean_up_temporary_dataset(cls.project)
    super().tearDownClass()

  @classmethod
  def _create_table(cls, table_name):
    table_schema = bigquery.TableSchema()

    number = bigquery.TableFieldSchema()
    number.name = 'number'
    number.type = 'INTEGER'
    table_schema.fields.append(number)

    string = bigquery.TableFieldSchema()
    string.name = 'string'
    string.type = 'STRING'
    table_schema.fields.append(string)

    # Named time_field/datetime_field to avoid shadowing the time and
    # datetime modules imported above.
    time_field = bigquery.TableFieldSchema()
    time_field.name = 'time'
    time_field.type = 'TIME'
    table_schema.fields.append(time_field)

    datetime_field = bigquery.TableFieldSchema()
    datetime_field.name = 'datetime'
    datetime_field.type = 'DATETIME'
    table_schema.fields.append(datetime_field)

    rec = bigquery.TableFieldSchema()
    rec.name = 'rec'
    rec.type = 'RECORD'
    rec_datetime = bigquery.TableFieldSchema()
    rec_datetime.name = 'rec_datetime'
    rec_datetime.type = 'DATETIME'
    rec.fields.append(rec_datetime)
    rec_rec = bigquery.TableFieldSchema()
    rec_rec.name = 'rec_rec'
    rec_rec.type = 'RECORD'
    rec_rec_datetime = bigquery.TableFieldSchema()
    rec_rec_datetime.name = 'rec_rec_datetime'
    rec_rec_datetime.type = 'DATETIME'
    rec_rec.fields.append(rec_rec_datetime)
    rec.fields.append(rec_rec)
    table_schema.fields.append(rec)

    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=cls.project, datasetId=cls.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=cls.project, datasetId=cls.dataset_id, table=table)
    cls.bigquery_client.client.tables.Insert(request)
    cls.bigquery_client.insert_rows(
        cls.project, cls.dataset_id, table_name, cls.TABLE_DATA)

  @classmethod
  def _setup_temporary_dataset(cls, project, query):
    location = cls.bigquery_client.get_query_location(project, query, False)
    cls.bigquery_client.create_temporary_dataset(project, location)

  @classmethod
  def _execute_query(cls, project, query):
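    """Runs `query` as a BATCH-priority job and returns a reference to the
    temporary table that BigQuery wrote the results to."""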
    query_job_name = bigquery_tools.generate_bq_job_name(
        'materializing_table_before_reading',
        str(uuid.uuid4())[0:10],
        bigquery_tools.BigQueryJobTypes.QUERY,
        '%d_%s' % (int(time.time()), secrets.token_hex(3)))
    cls._setup_temporary_dataset(cls.project, cls.query)
    job = cls.bigquery_client._start_query_job(
        project,
        query,
        use_legacy_sql=False,
        flatten_results=False,
        job_id=query_job_name,
        priority=beam.io.BigQueryQueryPriority.BATCH)
    job_ref = job.jobReference
    cls.bigquery_client.wait_for_bq_job(job_ref, max_retries=0)
    return cls.bigquery_client._get_temp_table(project)

  @pytest.mark.it_postcommit
  def test_iobase_source(self):
    EXPECTED_TABLE_DATA = [
        {
            'number': 1,
            'string': u'你好',
            'time': datetime.time(12, 44, 31),
            'datetime': '2018-12-31T12:44:31',
            'rec': None,
        },
        {
            'number': 4,
            'string': u'привет',
            'time': datetime.time(12, 44, 31),
            'datetime': '2018-12-31T12:44:31',
            'rec': {
                'rec_datetime': '2018-12-31T12:44:31',
                'rec_rec': {
                    'rec_rec_datetime': '2018-12-31T12:44:31',
                }
            },
        }
    ]
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              table=self.temp_table_reference))
      assert_that(result, equal_to(EXPECTED_TABLE_DATA))

  @pytest.mark.it_postcommit
  def test_iobase_source_with_native_datetime(self):
    EXPECTED_TABLE_DATA = [
        {
            'number': 1,
            'string': u'你好',
            'time': datetime.time(12, 44, 31),
            'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31),
            'rec': None,
        },
        {
            'number': 4,
            'string': u'привет',
            'time': datetime.time(12, 44, 31),
            'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31),
            'rec': {
                'rec_datetime': datetime.datetime(2018, 12, 31, 12, 44, 31),
                'rec_rec': {
                    'rec_rec_datetime': datetime.datetime(
                        2018, 12, 31, 12, 44, 31)
                }
            },
        }
    ]
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              table=self.temp_table_reference,
              use_native_datetime=True))
      assert_that(result, equal_to(EXPECTED_TABLE_DATA))

  @pytest.mark.it_postcommit
  def test_iobase_source_with_column_selection(self):
    EXPECTED_TABLE_DATA = [{'number': 1}, {'number': 4}]
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              table=self.temp_table_reference,
              selected_fields=['number']))
      assert_that(result, equal_to(EXPECTED_TABLE_DATA))

  @pytest.mark.it_postcommit
  def test_iobase_source_with_row_restriction(self):
    EXPECTED_TABLE_DATA = [{
        'number': 1,
        'string': u'你好',
        'time': datetime.time(12, 44, 31),
        'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31),
        'rec': None
    }]
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              table=self.temp_table_reference,
              row_restriction='number < 2',
              use_native_datetime=True))
      assert_that(result, equal_to(EXPECTED_TABLE_DATA))

  @pytest.mark.it_postcommit
  def test_iobase_source_with_column_selection_and_row_restriction(self):
    EXPECTED_TABLE_DATA = [{'string': u'привет'}]
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              table=self.temp_table_reference,
              row_restriction='number > 2',
              selected_fields=['string']))
      assert_that(result, equal_to(EXPECTED_TABLE_DATA))

  @pytest.mark.it_postcommit
  def test_iobase_source_with_very_selective_filters(self):
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'Read with BigQuery Storage API' >> beam.io.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              project=self.temp_table_reference.projectId,
              dataset=self.temp_table_reference.datasetId,
              table=self.temp_table_reference.tableId,
              row_restriction='number > 4',
              selected_fields=['string']))
      assert_that(result, equal_to([]))

  @pytest.mark.it_postcommit
  def test_iobase_source_with_query(self):
    EXPECTED_TABLE_DATA = [
        {
            'number': 1,
            'string': u'你好',
            'time': datetime.time(12, 44, 31),
            'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31),
            'rec': None,
        },
        {
            'number': 4,
            'string': u'привет',
            'time': datetime.time(12, 44, 31),
            'datetime': datetime.datetime(2018, 12, 31, 12, 44, 31),
            'rec': {
                'rec_datetime': datetime.datetime(2018, 12, 31, 12, 44, 31),
                'rec_rec': {
                    'rec_rec_datetime': datetime.datetime(
                        2018, 12, 31, 12, 44, 31)
                }
            },
        }
    ]
    query = StaticValueProvider(str, self.query)
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'Direct read with query' >> beam.io.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              use_native_datetime=True,
              use_standard_sql=True,
              project=self.project,
              query=query))
      assert_that(result, equal_to(EXPECTED_TABLE_DATA))

  @pytest.mark.it_postcommit
  def test_iobase_source_with_query_and_filters(self):
    EXPECTED_TABLE_DATA = [{'string': u'привет'}]
    query = StaticValueProvider(str, self.query)
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p | 'Direct read with query' >> beam.io.ReadFromBigQuery(
              method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
              row_restriction='number > 2',
              selected_fields=['string'],
              use_standard_sql=True,
              project=self.project,
              query=query))
      assert_that(result, equal_to(EXPECTED_TABLE_DATA))


class ReadNewTypesTests(BigQueryReadIntegrationTests):
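  """Reads FLOAT, NUMERIC, BYTES, DATE, TIME, DATETIME, TIMESTAMP and
  GEOGRAPHY values through both the native and iobase sources."""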
  @classmethod
  def setUpClass(cls):
    super().setUpClass()
    cls.table_name = 'python_new_types'
    cls.create_table(cls.table_name)

    table_id = '{}.{}'.format(cls.dataset_id, cls.table_name)
    cls.query = (
        'SELECT float, numeric, bytes, date, time, datetime, '
        'timestamp, geo FROM `%s`' % table_id)

  @classmethod
  def create_table(cls, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'float'
    table_field.type = 'FLOAT'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'numeric'
    table_field.type = 'NUMERIC'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'datetime'
    table_field.type = 'DATETIME'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'timestamp'
    table_field.type = 'TIMESTAMP'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'geo'
    table_field.type = 'GEOGRAPHY'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=cls.project, datasetId=cls.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=cls.project, datasetId=cls.dataset_id, table=table)
    cls.bigquery_client.client.tables.Insert(request)
    row_data = {
        'float': 0.33,
        'numeric': Decimal('10'),
        'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
        'date': '3000-12-31',
        'time': '23:59:59',
        'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC',
        'geo': 'POINT(30 10)'
    }

    table_data = [row_data]
    # Add rows that set one field at a time, leaving all other fields NULL.
    for key, value in row_data.items():
      table_data.append({key: value})

    cls.bigquery_client.insert_rows(
        cls.project, cls.dataset_id, table_name, table_data)

  def get_expected_data(self, native=True):
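    """Builds the expected rows, including the one-field-per-row variants.

    With native=True the BYTES column is expected base64-encoded, as returned
    by the native source; otherwise the raw bytes are expected.
    """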
    byts = b'\xab\xac'
    expected_row = {
        'float': 0.33,
        'numeric': Decimal('10'),
        'bytes': base64.b64encode(byts) if native else byts,
        'date': '3000-12-31',
        'time': '23:59:59',
        'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC',
        'geo': 'POINT(30 10)'
    }

    expected_data = [expected_row]

    # Add rows that set one field at a time, leaving all other fields None.
    for key, value in expected_row.items():
      row = {k: None for k in expected_row}
      row[key] = value
      expected_data.append(row)

    return expected_data

  @skip(['PortableRunner', 'FlinkRunner'])
  @pytest.mark.it_postcommit
  def test_native_source(self):
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p
          | 'read' >> beam.io.Read(
              beam.io.BigQuerySource(query=self.query, use_standard_sql=True)))
      assert_that(result, equal_to(self.get_expected_data()))

  @pytest.mark.it_postcommit
  def test_iobase_source(self):
    with beam.Pipeline(argv=self.args) as p:
      result = (
          p
          | 'read' >> beam.io.ReadFromBigQuery(
              query=self.query,
              use_standard_sql=True,
              project=self.project,
              bigquery_job_labels={'launcher': 'apache_beam_tests'})
          | beam.Map(datetime_to_utc))
      assert_that(result, equal_to(self.get_expected_data(native=False)))


class ReadAllBQTests(BigQueryReadIntegrationTests):
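  """Reads several queries/tables in one pipeline via ReadAllFromBigQuery,
  driven by a PCollection of ReadFromBigQueryRequest elements."""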
  TABLE_DATA_1 = [{
      'number': 1, 'str': 'abc'
  }, {
      'number': 2, 'str': 'def'
  }, {
      'number': 3, 'str': u'你好'
  }, {
      'number': 4, 'str': u'привет'
  }]

  TABLE_DATA_2 = [{
      'number': 10, 'str': 'abcd'
  }, {
      'number': 20, 'str': 'defg'
  }, {
      'number': 30, 'str': u'你好'
  }, {
      'number': 40, 'str': u'привет'
  }]

  TABLE_DATA_3 = [{'number': 10, 'str': 'abcde', 'extra': 3}]

  @classmethod
  def setUpClass(cls):
    super().setUpClass()
    cls.SCHEMA_BQ = cls.create_bq_schema()
    cls.SCHEMA_BQ_WITH_EXTRA = cls.create_bq_schema(True)

    cls.table_name1 = 'python_rd_table_1'
    cls.table_schema1 = cls.create_table(
        cls.table_name1, cls.TABLE_DATA_1, cls.SCHEMA_BQ)
    table_id1 = '{}.{}'.format(cls.dataset_id, cls.table_name1)
    cls.query1 = 'SELECT number, str FROM `%s`' % table_id1

    cls.table_name2 = 'python_rd_table_2'
    cls.table_schema2 = cls.create_table(
        cls.table_name2, cls.TABLE_DATA_2, cls.SCHEMA_BQ)
    table_id2 = '{}.{}'.format(cls.dataset_id, cls.table_name2)
    # Legacy SQL, hence no backticks; read with use_standard_sql=False below.
    cls.query2 = 'SELECT number, str FROM %s' % table_id2

    cls.table_name3 = 'python_rd_table_3'
    cls.table_schema3 = cls.create_table(
        cls.table_name3, cls.TABLE_DATA_3, cls.SCHEMA_BQ_WITH_EXTRA)
    table_id3 = '{}.{}'.format(cls.dataset_id, cls.table_name3)
    cls.query3 = 'SELECT number, str, extra FROM `%s`' % table_id3

  @classmethod
  def create_table(cls, table_name, data, table_schema):
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=cls.project, datasetId=cls.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=cls.project, datasetId=cls.dataset_id, table=table)
    cls.bigquery_client.client.tables.Insert(request)
    cls.bigquery_client.insert_rows(
        cls.project, cls.dataset_id, table_name, data)
    return table_schema

  @classmethod
  def create_bq_schema(cls, with_extra=False):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'number'
    table_field.type = 'INTEGER'
    table_field.mode = 'NULLABLE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'str'
    table_field.type = 'STRING'
    table_field.mode = 'NULLABLE'
    table_schema.fields.append(table_field)
    if with_extra:
      table_field = bigquery.TableFieldSchema()
      table_field.name = 'extra'
      table_field.type = 'INTEGER'
      table_field.mode = 'NULLABLE'
      table_schema.fields.append(table_field)
    return table_schema

  @skip(['PortableRunner', 'FlinkRunner'])
  @pytest.mark.it_postcommit
  def test_read_queries(self):
    # TODO(https://github.com/apache/beam/issues/20610): Remove the experiment
    # once these tests run on Runner v2.
    args = self.args + ["--experiments=use_runner_v2"]
    with beam.Pipeline(argv=args) as p:
      result = (
          p
          | beam.Create([
              beam.io.ReadFromBigQueryRequest(query=self.query1),
              beam.io.ReadFromBigQueryRequest(
                  query=self.query2, use_standard_sql=False),
              beam.io.ReadFromBigQueryRequest(
                  table='%s.%s' % (self.dataset_id, self.table_name3))
          ])
          | beam.io.ReadAllFromBigQuery())
      assert_that(
          result,
          equal_to(self.TABLE_DATA_1 + self.TABLE_DATA_2 + self.TABLE_DATA_3))


class ReadInteractiveRunnerTests(BigQueryReadIntegrationTests):
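  """Smoke test for ReadFromBigQuery on the InteractiveRunner, where
  interactive_beam.collect() returns the result as a pandas DataFrame."""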
  @skip(['PortableRunner', 'FlinkRunner'])
  @pytest.mark.it_postcommit
  def test_read_in_interactive_runner(self):
    p = beam.Pipeline(InteractiveRunner(), argv=self.args)
    pcoll = p | beam.io.ReadFromBigQuery(query="SELECT 1")
    result = interactive_beam.collect(pcoll)
    assert result.iloc[0, 0] == 1


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()