github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_write_it_test.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for BigQuery sources and sinks."""
# pytype: skip-file

import base64
import datetime
import logging
import secrets
import time
import unittest
from decimal import Decimal

import hamcrest as hc
import mock
import pytest
import pytz
from parameterized import param
from parameterized import parameterized

import apache_beam as beam
from apache_beam.io.gcp.bigquery import BigQueryWriteFn
from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
from apache_beam.io.gcp.bigquery_tools import FileFormat
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

# Protect against environments where the bigquery library is not available.
# pylint: disable=wrong-import-order, wrong-import-position

try:
  from apitools.base.py.exceptions import HttpError
except ImportError:
  HttpError = None
# pylint: enable=wrong-import-order, wrong-import-position

_LOGGER = logging.getLogger(__name__)


class BigQueryWriteIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_write_to_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%d%s' % (
        self.BIG_QUERY_DATASET_ID, int(time.time()), secrets.token_hex(3))
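    # For illustration: this yields unique names such as
    # 'python_write_to_table_1689012345a1b2c3' (the value shown is
    # hypothetical), i.e. epoch seconds followed by the 6 hex characters
    # returned by secrets.token_hex(3).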
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    _LOGGER.info(
        "Created dataset %s in project %s", self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id, deleteContents=True)
    try:
      _LOGGER.info(
          "Deleting dataset %s in project %s", self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      _LOGGER.debug(
          'Failed to clean up dataset %s in project %s',
          self.dataset_id,
          self.project)

  def create_table(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'int64'
    table_field.type = 'INT64'
    table_field.mode = 'REQUIRED'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)

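  # For reference (a sketch, not used directly by the tests): the same schema
  # expressed in the dict form that WriteToBigQuery accepts elsewhere in this
  # file would look roughly like:
  #
  #   {
  #       "fields": [
  #           {"name": "int64", "type": "INT64", "mode": "REQUIRED"},
  #           {"name": "bytes", "type": "BYTES"},
  #           {"name": "date", "type": "DATE"},
  #           {"name": "time", "type": "TIME"},
  #       ]
  #   }
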
  @pytest.mark.it_postcommit
  def test_big_query_write(self):
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': '你好'},
        {'number': 4, 'str': 'привет'},
    ]
    table_schema = {
        "fields": [
            {"name": "number", "type": "INTEGER"},
            {"name": "str", "type": "STRING"},
        ]
    }

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[
                (1, 'abc'),
                (2, 'def'),
                (3, '你好'),
                (4, 'привет'),
            ])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @pytest.mark.it_postcommit
  def test_big_query_write_schema_autodetect(self):
    if self.runner_name == 'TestDataflowRunner':
      self.skipTest('DataflowRunner does not support schema autodetection')

    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[
                (1, 'abc'),
                (2, 'def'),
            ])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
              schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
              temp_file_format=FileFormat.JSON))

  @pytest.mark.it_postcommit
  def test_big_query_write_new_types(self):
    table_name = 'python_new_types_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    row_data = {
        'float': 0.33,
        'numeric': Decimal('10'),
        'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
        'date': '3000-12-31',
        'time': '23:59:59',
        'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC',
        'geo': 'POINT(30 10)'
    }

    input_data = [row_data]
    # Add rows with only one key-value pair and None values for all other keys.
    for key, value in row_data.items():
      input_data.append({key: value})

    table_schema = {
        "fields": [
            {"name": "float", "type": "FLOAT"},
            {"name": "numeric", "type": "NUMERIC"},
            {"name": "bytes", "type": "BYTES"},
            {"name": "date", "type": "DATE"},
            {"name": "time", "type": "TIME"},
            {"name": "datetime", "type": "DATETIME"},
            {"name": "timestamp", "type": "TIMESTAMP"},
            {"name": "geo", "type": "GEOGRAPHY"},
        ]
    }

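    # As the tuple below illustrates, the result matcher sees values in their
    # Python representations: bytes come back base64-decoded, and DATE, TIME,
    # DATETIME and TIMESTAMP come back as datetime.date/time/datetime objects,
    # with TIMESTAMP timezone-aware.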
    expected_row = (
        0.33,
        Decimal('10'),
        b'\xab\xac',
        datetime.date(3000, 12, 31),
        datetime.time(23, 59, 59),
        datetime.datetime(2018, 12, 31, 12, 44, 31),
        datetime.datetime(2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc),
        'POINT(30 10)',
    )

    expected_data = [expected_row]

    # Add rows with only one key-value pair and None values for all other keys.
    for i, value in enumerate(expected_row):
      row = [None] * len(expected_row)
      row[i] = value
      expected_data.append(tuple(row))

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query='SELECT float, numeric, bytes, date, time, datetime, '
            'timestamp, geo FROM %s' % table_id,
            data=expected_data)
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @pytest.mark.it_postcommit
  def test_big_query_write_without_schema(self):
    table_name = 'python_no_schema_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {
            'int64': 1,
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        },
        {
            'int64': 2,
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        },
        {
            'int64': 3,
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        },
        {
            'int64': 4,
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        },
    ]
    # The BigQuery sink expects bytes values to be base64-encoded.
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

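    # A quick sanity check of the encoding (a doctest-style sketch):
    #
    #   >>> base64.b64encode(b'xyw')
    #   b'eHl3'
    #
    # BigQuery decodes this back to b'xyw', which is what the matcher below
    # expects for the first row.
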
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT int64, bytes, date, time FROM %s" % table_id,
            data=[
                (
                    1,
                    b'xyw',
                    datetime.date(2011, 1, 1),
                    datetime.time(23, 59, 59, 999999),
                ),
                (
                    2,
                    b'abc',
                    datetime.date(2000, 1, 1),
                    datetime.time(0, 0, 0),
                ),
                (
                    3,
                    b'\xe4\xbd\xa0\xe5\xa5\xbd',
                    datetime.date(3000, 12, 31),
                    datetime.time(23, 59, 59),
                ),
                (
                    4,
                    b'\xab\xac\xad',
                    datetime.date(2000, 1, 1),
                    datetime.time(0, 0, 0),
                ),
            ])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
              temp_file_format=FileFormat.JSON))

  @pytest.mark.it_postcommit
  def test_big_query_write_insert_errors_reporting(self):
    """
    Test that errors returned by beam.io.WriteToBigQuery
    contain both the failed rows and the reasons for their failure.
    """
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {
            'number': 1,
            'str': 'some_string',
        },
        {
            'number': 2
        },
        {
            'number': 3,
            'str': 'some_string',
            'additional_field_str': 'some_string',
        },
    ]

    table_schema = {
        "fields": [
            {"name": "number", "type": "INTEGER", "mode": "REQUIRED"},
            {"name": "str", "type": "STRING", "mode": "REQUIRED"},
        ]
    }

    bq_result_errors = [
        (
            {"number": 2},
            [{
                "reason": "invalid",
                "location": "",
                "debugInfo": "",
                "message": "Missing required field: Msg_0_CLOUD_QUERY_TABLE.str."
            }],
        ),
        (
            {
                "number": 3,
                "str": "some_string",
                "additional_field_str": "some_string"
            },
            [{
                "reason": "invalid",
                "location": "additional_field_str",
                "debugInfo": "",
                "message": "no such field: additional_field_str."
            }],
        ),
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'some_string')]),
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      errors = (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              method='STREAMING_INSERTS',
              insert_retry_strategy='RETRY_NEVER',
              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

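      # Each element of the FAILED_ROWS_WITH_ERRORS output is, in this Beam
      # version, a (destination, row, errors) tuple; the map below keeps only
      # the (row, errors) pair for comparison against bq_result_errors.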
      assert_that(
          errors[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS]
          | 'ParseErrors' >> beam.Map(lambda err: (err[1], err[2])),
          equal_to(bq_result_errors))

  @pytest.mark.it_postcommit
  @parameterized.expand([
      param(file_format=FileFormat.AVRO),
      param(file_format=FileFormat.JSON),
      param(file_format=None),
  ])
  @mock.patch(
      "apache_beam.io.gcp.bigquery_file_loads._MAXIMUM_SOURCE_URIS", new=1)
  def test_big_query_write_temp_table_append_schema_update(self, file_format):
    """
    Test that nested schema update options and schema relaxation
    are respected when appending to an existing table via temporary tables.

    _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
    load jobs and the use of temporary tables.
    """
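    # Rough mechanics (an inference from the knobs above, not asserted by the
    # test): max_file_size=1 byte puts each row in its own temp file, and
    # patching _MAXIMUM_SOURCE_URIS to 1 caps each load job at a single file,
    # so rows land in separate temporary tables and are then copied into the
    # destination table, the code path that must honor schemaUpdateOptions.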
    table_name = 'python_append_schema_update'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    # The bytes, date and time fields are optional and omitted in the test;
    # only the relaxed int64 column and the new columns are specified.
    table_schema = {
        "fields": [
            {
                "name": "int64",
                "type": "INT64",
                "mode": "NULLABLE",
            },
            {
                "name": "bool",
                "type": "BOOL",
            },
            {
                "name": "nested_field",
                "type": "RECORD",
                "mode": "REPEATED",
                "fields": [
                    {"name": "fruit", "type": "STRING", "mode": "NULLABLE"},
                ]
            },
        ]
    }
    input_data = [
        {
            "int64": 1,
            "bool": True,
            "nested_field": [{"fruit": "Apple"}]
        },
        {
            "bool": False,
            "nested_field": [{"fruit": "Mango"}]
        },
        {
            "int64": None,
            "bool": True,
            "nested_field": [{"fruit": "Banana"}]
        },
    ]
    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=BigqueryFullResultMatcher(
            project=self.project,
            query="""
            SELECT bytes, date, time, int64, bool, fruit
            FROM {},
            UNNEST(nested_field) as nested_field
            ORDER BY fruit
            """.format(table_id),
            data=[
                (None, None, None, 1, True, "Apple"),
                (None, None, None, None, True, "Banana"),
                (None, None, None, None, False, "Mango"),
            ]))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
              max_file_size=1,  # bytes
              method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
              additional_bq_parameters={
                  'schemaUpdateOptions': [
                      'ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'
                  ]
              },
              temp_file_format=file_format))


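# Example invocation (a sketch; exact flags depend on the local test setup):
# these integration tests are normally driven through pytest with Beam's
# --test-pipeline-options flag, e.g.
#
#   pytest -m it_postcommit bigquery_write_it_test.py \
#       --test-pipeline-options="--runner=TestDataflowRunner \
#           --project=<your-project>"
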
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()