github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_file_loads_test.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Unit tests for BigQuery file loads utilities."""
    19  
    20  # pytype: skip-file
    21  
    22  import logging
    23  import os
    24  import secrets
    25  import time
    26  import unittest
    27  
    28  import mock
    29  import pytest
    30  from hamcrest.core import assert_that as hamcrest_assert
    31  from hamcrest.core.core.allof import all_of
    32  from hamcrest.core.core.is_ import is_
    33  from parameterized import param
    34  from parameterized import parameterized
    35  
    36  import apache_beam as beam
    37  from apache_beam.io.filebasedsink_test import _TestCaseWithTempDirCleanUp
    38  from apache_beam.io.gcp import bigquery_file_loads as bqfl
    39  from apache_beam.io.gcp import bigquery
    40  from apache_beam.io.gcp import bigquery_tools
    41  from apache_beam.io.gcp.bigquery import BigQueryDisposition
    42  from apache_beam.io.gcp.internal.clients import bigquery as bigquery_api
    43  from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher
    44  from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultStreamingMatcher
    45  from apache_beam.options.pipeline_options import PipelineOptions
    46  from apache_beam.options.pipeline_options import StandardOptions
    47  from apache_beam.runners.dataflow.test_dataflow_runner import TestDataflowRunner
    48  from apache_beam.testing.test_pipeline import TestPipeline
    49  from apache_beam.testing.test_stream import TestStream
    50  from apache_beam.testing.util import assert_that
    51  from apache_beam.testing.util import equal_to
    52  from apache_beam.transforms import combiners
    53  from apache_beam.transforms.window import TimestampedValue
    54  from apache_beam.typehints.typehints import Tuple
    55  from apache_beam.utils import timestamp
    56  
    57  try:
    58    from apitools.base.py.exceptions import HttpError
    59  except ImportError:
    60    raise unittest.SkipTest('GCP dependencies are not installed')
    61  
    62  _LOGGER = logging.getLogger(__name__)
    63  
    64  _DESTINATION_ELEMENT_PAIRS = [
    65      # DESTINATION 1
    66      ('project1:dataset1.table1', {
    67          'name': 'beam', 'language': 'py'
    68      }),
    69      ('project1:dataset1.table1', {
    70          'name': 'beam', 'language': 'java'
    71      }),
    72      ('project1:dataset1.table1', {
    73          'name': 'beam', 'language': 'go'
    74      }),
    75      ('project1:dataset1.table1', {
    76          'name': 'flink', 'language': 'java'
    77      }),
    78      ('project1:dataset1.table1', {
    79          'name': 'flink', 'language': 'scala'
    80      }),
    81  
    82      # DESTINATION 3
    83      ('project1:dataset1.table3', {
    84          'name': 'spark', 'language': 'scala'
    85      }),
    86  
    87      # DESTINATION 1
    88      ('project1:dataset1.table1', {
    89          'name': 'spark', 'language': 'py'
    90      }),
    91      ('project1:dataset1.table1', {
    92          'name': 'spark', 'language': 'scala'
    93      }),
    94  
    95      # DESTINATION 2
    96      ('project1:dataset1.table2', {
    97          'name': 'beam', 'foundation': 'apache'
    98      }),
    99      ('project1:dataset1.table2', {
   100          'name': 'flink', 'foundation': 'apache'
   101      }),
   102      ('project1:dataset1.table2', {
   103          'name': 'spark', 'foundation': 'apache'
   104      }),
   105  ]
   106  
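         # Derived fixtures: the distinct table specs referenced above, and the bare
         # row dicts (destinations stripped) that the single-destination tests reuse.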
   107  _DISTINCT_DESTINATIONS = list({elm[0] for elm in _DESTINATION_ELEMENT_PAIRS})
   108  
   109  _ELEMENTS = [elm[1] for elm in _DESTINATION_ELEMENT_PAIRS]
   110  
   111  _ELEMENTS_SCHEMA = bigquery.WriteToBigQuery.get_dict_table_schema(
   112      bigquery_api.TableSchema(
   113          fields=[
   114              bigquery_api.TableFieldSchema(
   115                  name="name", type="STRING", mode="REQUIRED"),
   116              bigquery_api.TableFieldSchema(name="language", type="STRING"),
   117              bigquery_api.TableFieldSchema(name="foundation", type="STRING"),
   118          ]))
   119  
   120  
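         # These tests drive bqfl.WriteRecordsToFile directly with (destination, row)
         # pairs and inspect its two tagged outputs: WRITTEN_FILE_TAG carries
         # (destination, (file path, file size)) results, while UNWRITTEN_RECORD_TAG
         # carries records spilled once max_files_per_bundle is exhausted.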
   121  class TestWriteRecordsToFile(_TestCaseWithTempDirCleanUp):
   122    maxDiff = None
   123  
   124    def _consume_input(self, fn, checks=None):
   125      if checks is None:
   126        return
   127  
   128      with TestPipeline() as p:
   129        output_pcs = (
   130            p
   131            | beam.Create(_DESTINATION_ELEMENT_PAIRS, reshuffle=False)
   132            | beam.ParDo(fn, self.tmpdir).with_outputs(
   133                fn.WRITTEN_FILE_TAG, fn.UNWRITTEN_RECORD_TAG))
   134  
   135        checks(output_pcs)
   136        return output_pcs
   137  
   138    @parameterized.expand([
   139        param(file_format=bigquery_tools.FileFormat.AVRO),
   140        param(file_format=bigquery_tools.FileFormat.JSON),
   141        param(file_format=None),
   142    ])
   143    def test_files_created(self, file_format):
   144      """Test that the files are created and written."""
   145  
   146      fn = bqfl.WriteRecordsToFile(
   147          schema=_ELEMENTS_SCHEMA, file_format=file_format)
   148      self.tmpdir = self._new_tempdir()
   149  
   150      def check_files_created(output_pcs):
   151        dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]
   152  
   153        files = dest_file_pc | "GetFiles" >> beam.Map(lambda x: x[1][0])
   154        file_count = files | "CountFiles" >> combiners.Count.Globally()
   155  
   156        _ = files | "FilesExist" >> beam.Map(
   157            lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
   158        assert_that(file_count, equal_to([3]), label='check file count')
   159  
   160        destinations = (
   161            dest_file_pc
   162            | "GetDests" >>
   163            beam.Map(lambda x: bigquery_tools.get_hashable_destination(x[0])))
   164        assert_that(
   165            destinations,
   166            equal_to(list(_DISTINCT_DESTINATIONS)),
   167            label='check destinations ')
   168  
   169      self._consume_input(fn, check_files_created)
   170  
   171    def test_many_files(self):
   172      """Forces records to be written to many files.
   173  
    174      Multiple files are necessary for each destination because the maximum
    175      file size is very small, so only a couple of records fit in each file.
   176      """
   177  
   178      fn = bqfl.WriteRecordsToFile(schema=_ELEMENTS_SCHEMA, max_file_size=50)
   179      self.tmpdir = self._new_tempdir()
   180  
   181      def check_many_files(output_pcs):
   182        dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]
   183  
   184        files_per_dest = (
   185            dest_file_pc
   186            | beam.Map(lambda x: x).with_output_types(
   187                beam.typehints.KV[str, Tuple[str, int]])
   188            | combiners.Count.PerKey())
   189        files_per_dest = (
   190            files_per_dest
   191            | "GetDests" >> beam.Map(
   192                lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1])))
   193        assert_that(
   194            files_per_dest,
   195            equal_to([('project1:dataset1.table1', 4),
   196                      ('project1:dataset1.table2', 2),
   197                      ('project1:dataset1.table3', 1)]))
   198  
   199        # Check that the files exist
   200        _ = dest_file_pc | beam.Map(lambda x: x[1][0]) | beam.Map(
   201            lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
   202  
   203      self._consume_input(fn, check_many_files)
   204  
   205    @parameterized.expand([
   206        param(file_format=bigquery_tools.FileFormat.AVRO),
   207        param(file_format=bigquery_tools.FileFormat.JSON),
   208    ])
   209    def test_records_are_spilled(self, file_format):
   210      """Forces records to be written to many files.
   211  
    212      Multiple files are necessary for each destination, but at most two files
    213      can be created per bundle. This forces the remaining records to be spilled
    214      to the next stage of processing.
   215      """
   216  
   217      fn = bqfl.WriteRecordsToFile(
   218          schema=_ELEMENTS_SCHEMA,
   219          max_files_per_bundle=2,
   220          file_format=file_format)
   221      self.tmpdir = self._new_tempdir()
   222  
   223      def check_many_files(output_pcs):
   224        dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]
   225        spilled_records_pc = output_pcs[
   226            bqfl.WriteRecordsToFile.UNWRITTEN_RECORD_TAG]
   227  
   228        spilled_records_count = (spilled_records_pc | combiners.Count.Globally())
   229        assert_that(spilled_records_count, equal_to([3]), label='spilled count')
   230  
   231        files_per_dest = (
   232            dest_file_pc
   233            | beam.Map(lambda x: x).with_output_types(
   234                beam.typehints.KV[str, Tuple[str, int]])
   235            | combiners.Count.PerKey())
   236        files_per_dest = (
   237            files_per_dest
   238            | "GetDests" >> beam.Map(
   239                lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1])))
   240  
   241        # Only table1 and table3 get files. table2 records get spilled.
   242        assert_that(
   243            files_per_dest,
   244            equal_to([('project1:dataset1.table1', 1),
   245                      ('project1:dataset1.table3', 1)]),
   246            label='file count')
   247  
   248        # Check that the files exist
   249        _ = dest_file_pc | beam.Map(lambda x: x[1][0]) | beam.Map(
   250            lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
   251  
   252      self._consume_input(fn, check_many_files)
   253  
   254  
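         # Unlike WriteRecordsToFile above, WriteGroupedRecordsToFile runs after a
         # GroupByKey, so it consumes (destination, iterable of rows) pairs and emits a
         # single output PCollection of (destination, (file path, file size)) pairs
         # with no spill output.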
   255  class TestWriteGroupedRecordsToFile(_TestCaseWithTempDirCleanUp):
   256    def _consume_input(self, fn, input, checks):
   257      if checks is None:
   258        return
   259  
   260      with TestPipeline() as p:
   261        res = (
   262            p
   263            | beam.Create(input)
   264            | beam.GroupByKey()
   265            | beam.ParDo(fn, self.tmpdir))
   266  
   267        checks(res)
   268        return res
   269  
   270    @parameterized.expand([
   271        param(file_format=bigquery_tools.FileFormat.AVRO),
   272        param(file_format=bigquery_tools.FileFormat.JSON),
   273        param(file_format=None),
   274    ])
   275    def test_files_are_created(self, file_format):
   276      """Test that the files are created and written."""
   277  
   278      fn = bqfl.WriteGroupedRecordsToFile(
   279          schema=_ELEMENTS_SCHEMA, file_format=file_format)
   280      self.tmpdir = self._new_tempdir()
   281  
   282      def check_files_created(output_pc):
   283        files = output_pc | "GetFiles" >> beam.Map(lambda x: x[1][0])
   284        file_count = files | "CountFiles" >> combiners.Count.Globally()
   285  
   286        _ = files | "FilesExist" >> beam.Map(
   287            lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
   288        assert_that(file_count, equal_to([3]), label='check file count')
   289  
   290        destinations = (
   291            output_pc
   292            | "GetDests" >>
   293            beam.Map(lambda x: bigquery_tools.get_hashable_destination(x[0])))
   294        assert_that(
   295            destinations,
   296            equal_to(list(_DISTINCT_DESTINATIONS)),
   297            label='check destinations ')
   298  
   299      self._consume_input(fn, _DESTINATION_ELEMENT_PAIRS, check_files_created)
   300  
   301    def test_multiple_files(self):
   302      """Forces records to be written to many files.
   303  
    304      Multiple files are necessary for each destination because the maximum
    305      file size is very small, so only a couple of records fit in each file.
   306      """
   307      fn = bqfl.WriteGroupedRecordsToFile(
   308          schema=_ELEMENTS_SCHEMA, max_file_size=50)
   309      self.tmpdir = self._new_tempdir()
   310  
   311      def check_multiple_files(output_pc):
   312        files_per_dest = output_pc | combiners.Count.PerKey()
   313        files_per_dest = (
   314            files_per_dest
   315            | "GetDests" >> beam.Map(
   316                lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1])))
   317        assert_that(
   318            files_per_dest,
   319            equal_to([
   320                ('project1:dataset1.table1', 4),
   321                ('project1:dataset1.table2', 2),
   322                ('project1:dataset1.table3', 1),
   323            ]))
   324  
   325        # Check that the files exist
   326        _ = output_pc | beam.Map(lambda x: x[1][0]) | beam.Map(os.path.exists)
   327  
   328      self._consume_input(fn, _DESTINATION_ELEMENT_PAIRS, check_multiple_files)
   329  
   330  
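         # PartitionFiles groups each destination's files into load-job-sized
         # partitions; judging from the tests below, its two constructor arguments act
         # as a partition size cap and a per-partition file-count cap. Destinations
         # needing more than one partition are emitted under MULTIPLE_PARTITIONS_TAG,
         # the rest under SINGLE_PARTITION_TAG.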
   331  class TestPartitionFiles(unittest.TestCase):
   332  
   333    _ELEMENTS = [(
   334        'destination0', [('file0', 50), ('file1', 50), ('file2', 50),
   335                         ('file3', 50)]),
   336                 ('destination1', [('file0', 50), ('file1', 50)])]
   337  
   338    def test_partition(self):
   339      partition = bqfl.PartitionFiles.Partition(1000, 1)
   340      self.assertEqual(partition.can_accept(50), True)
   341      self.assertEqual(partition.can_accept(2000), False)
   342      self.assertEqual(partition.can_accept(1000), True)
   343  
   344      partition.add('file1', 50)
   345      self.assertEqual(partition.files, ['file1'])
   346      self.assertEqual(partition.size, 50)
   347      self.assertEqual(partition.can_accept(50), False)
   348      self.assertEqual(partition.can_accept(0), False)
   349  
   350    def test_partition_files_dofn_file_split(self):
   351      """Force partitions to split based on max_files"""
   352      multiple_partitions_result = [('destination0', ['file0', 'file1']),
   353                                    ('destination0', ['file2', 'file3'])]
   354      single_partition_result = [('destination1', ['file0', 'file1'])]
   355      with TestPipeline() as p:
   356        destination_file_pairs = p | beam.Create(self._ELEMENTS, reshuffle=False)
   357        partitioned_files = (
   358            destination_file_pairs
   359            | beam.ParDo(bqfl.PartitionFiles(1000, 2)).with_outputs(
   360                bqfl.PartitionFiles.MULTIPLE_PARTITIONS_TAG,
   361                bqfl.PartitionFiles.SINGLE_PARTITION_TAG))
   362        multiple_partitions = partitioned_files[bqfl.PartitionFiles\
   363                                                .MULTIPLE_PARTITIONS_TAG]
   364        single_partition = partitioned_files[bqfl.PartitionFiles\
   365                                             .SINGLE_PARTITION_TAG]
   366  
   367      assert_that(
   368          multiple_partitions,
   369          equal_to(multiple_partitions_result),
   370          label='CheckMultiplePartitions')
   371      assert_that(
   372          single_partition,
   373          equal_to(single_partition_result),
   374          label='CheckSinglePartition')
   375  
   376    def test_partition_files_dofn_size_split(self):
   377      """Force partitions to split based on max_partition_size"""
   378      multiple_partitions_result = [('destination0', ['file0', 'file1', 'file2']),
   379                                    ('destination0', ['file3'])]
   380      single_partition_result = [('destination1', ['file0', 'file1'])]
   381      with TestPipeline() as p:
   382        destination_file_pairs = p | beam.Create(self._ELEMENTS, reshuffle=False)
   383        partitioned_files = (
   384            destination_file_pairs
   385            | beam.ParDo(bqfl.PartitionFiles(150, 10)).with_outputs(
   386                bqfl.PartitionFiles.MULTIPLE_PARTITIONS_TAG,
   387                bqfl.PartitionFiles.SINGLE_PARTITION_TAG))
   388        multiple_partitions = partitioned_files[bqfl.PartitionFiles\
   389                                                .MULTIPLE_PARTITIONS_TAG]
   390        single_partition = partitioned_files[bqfl.PartitionFiles\
   391                                             .SINGLE_PARTITION_TAG]
   392  
   393      assert_that(
   394          multiple_partitions,
   395          equal_to(multiple_partitions_result),
   396          label='CheckMultiplePartitions')
   397      assert_that(
   398          single_partition,
   399          equal_to(single_partition_result),
   400          label='CheckSinglePartition')
   401  
   402  
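         # End-to-end unit tests of the BigQueryBatchFileLoads transform, run on the
         # DirectRunner against a mocked BigQuery client so no service calls are made.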
   403  class TestBigQueryFileLoads(_TestCaseWithTempDirCleanUp):
   404    def test_trigger_load_jobs_with_empty_files(self):
   405      destination = "project:dataset.table"
   406      empty_files = []
   407      load_job_prefix = "test_prefix"
   408  
   409      with beam.Pipeline() as p:
   410        partitions = (
   411            p
   412            | beam.Create([(destination, empty_files)])
   413            | beam.ParDo(bqfl.PartitionFiles(1000, 10)).with_outputs(
   414                bqfl.PartitionFiles.MULTIPLE_PARTITIONS_TAG,
   415                bqfl.PartitionFiles.SINGLE_PARTITION_TAG))
   416  
   417        _ = (
   418            partitions[bqfl.PartitionFiles.SINGLE_PARTITION_TAG]
   419            | beam.ParDo(bqfl.TriggerLoadJobs(), load_job_prefix))
   420  
   421    def test_records_traverse_transform_with_mocks(self):
   422      destination = 'project1:dataset1.table1'
   423  
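             # Stub out the BigQuery client: jobs.Insert hands back a job whose
             # reference the test asserts on, and jobs.Get reports it as DONE with no
             # errorResult, so the transform never has to wait on a real load job.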
   424      job_reference = bigquery_api.JobReference()
   425      job_reference.projectId = 'project1'
   426      job_reference.jobId = 'job_name1'
   427      result_job = bigquery_api.Job()
   428      result_job.jobReference = job_reference
   429  
   430      mock_job = mock.Mock()
   431      mock_job.status.state = 'DONE'
   432      mock_job.status.errorResult = None
   433      mock_job.jobReference = job_reference
   434  
   435      bq_client = mock.Mock()
   436      bq_client.jobs.Get.return_value = mock_job
   437  
   438      bq_client.jobs.Insert.return_value = result_job
   439  
   440      transform = bqfl.BigQueryBatchFileLoads(
   441          destination,
   442          custom_gcs_temp_location=self._new_tempdir(),
   443          test_client=bq_client,
   444          validate=False,
   445          temp_file_format=bigquery_tools.FileFormat.JSON)
   446  
   447      # Need to test this with the DirectRunner to avoid serializing mocks
   448      with TestPipeline('DirectRunner') as p:
   449        outputs = p | beam.Create(_ELEMENTS) | transform
   450  
   451        dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
   452        dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
   453  
   454        jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])
   455  
   456        files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
   457        destinations = (
   458            dest_files
   459            | "GetDests" >> beam.Map(
   460                lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
   461            | "GetUniques" >> combiners.Count.PerKey()
   462            | "GetFinalDests" >> beam.Keys())
   463  
   464        # All files exist
   465        _ = (
   466            files
   467            | beam.Map(lambda x: hamcrest_assert(os.path.exists(x), is_(True))))
   468  
   469        # One file per destination
   470        assert_that(
   471            files | combiners.Count.Globally(), equal_to([1]), label='CountFiles')
   472  
   473        assert_that(
   474            destinations, equal_to([destination]), label='CheckDestinations')
   475  
   476        assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
   477  
   478    def test_load_job_id_used(self):
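             # Exercises load_job_project_id: the mocked client returns a job reference
             # under 'loadJobProject', and the test asserts that this reference is what
             # the transform reports for the load job.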
   479      job_reference = bigquery_api.JobReference()
   480      job_reference.projectId = 'loadJobProject'
   481      job_reference.jobId = 'job_name1'
   482  
   483      result_job = bigquery_api.Job()
   484      result_job.jobReference = job_reference
   485  
   486      mock_job = mock.Mock()
   487      mock_job.status.state = 'DONE'
   488      mock_job.status.errorResult = None
   489      mock_job.jobReference = job_reference
   490  
   491      bq_client = mock.Mock()
   492      bq_client.jobs.Get.return_value = mock_job
   493  
   494      bq_client.jobs.Insert.return_value = result_job
   495  
   496      transform = bqfl.BigQueryBatchFileLoads(
   497          'project1:dataset1.table1',
   498          custom_gcs_temp_location=self._new_tempdir(),
   499          test_client=bq_client,
   500          validate=False,
   501          load_job_project_id='loadJobProject')
   502  
   503      with TestPipeline('DirectRunner') as p:
   504        outputs = p | beam.Create(_ELEMENTS) | transform
   505        jobs = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS] \
   506               | "GetJobs" >> beam.Map(lambda x: x[1])
   507  
   508        assert_that(jobs, equal_to([job_reference]), label='CheckJobProjectIds')
   509  
   510    def test_load_job_id_use_for_copy_job(self):
   511      destination = 'project1:dataset1.table1'
   512  
   513      job_reference = bigquery_api.JobReference()
   514      job_reference.projectId = 'loadJobProject'
   515      job_reference.jobId = 'job_name1'
   516      result_job = mock.Mock()
   517      result_job.jobReference = job_reference
   518  
   519      mock_job = mock.Mock()
   520      mock_job.status.state = 'DONE'
   521      mock_job.status.errorResult = None
   522      mock_job.jobReference = job_reference
   523  
   524      bq_client = mock.Mock()
   525      bq_client.jobs.Get.return_value = mock_job
   526  
   527      bq_client.jobs.Insert.return_value = result_job
   528      bq_client.tables.Delete.return_value = None
   529  
   530      with TestPipeline('DirectRunner') as p:
   531        outputs = (
   532            p
   533            | beam.Create(_ELEMENTS, reshuffle=False)
   534            | bqfl.BigQueryBatchFileLoads(
   535                destination,
   536                custom_gcs_temp_location=self._new_tempdir(),
   537                test_client=bq_client,
   538                validate=False,
   539                temp_file_format=bigquery_tools.FileFormat.JSON,
   540                max_file_size=45,
   541                max_partition_size=80,
   542                max_files_per_partition=2,
   543                load_job_project_id='loadJobProject'))
   544  
   545        dest_copy_jobs = outputs[
   546            bqfl.BigQueryBatchFileLoads.DESTINATION_COPY_JOBID_PAIRS]
   547  
   548        copy_jobs = dest_copy_jobs | "GetCopyJobs" >> beam.Map(lambda x: x[1])
   549  
   550        assert_that(
   551            copy_jobs,
   552            equal_to([
   553                job_reference,
   554                job_reference,
   555                job_reference,
   556                job_reference,
   557                job_reference,
   558                job_reference
   559            ]),
   560            label='CheckCopyJobProjectIds')
   561  
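           # The next two tests drive TriggerLoadJobs with a client whose jobs.Get
           # side_effect reports job 1 as RUNNING on the first poll, forcing exactly
           # one sleep (time.sleep is mocked) before both jobs are re-checked.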
   562    @mock.patch('time.sleep')
   563    def test_wait_for_load_job_completion(self, sleep_mock):
   564      job_1 = bigquery_api.Job()
   565      job_1.jobReference = bigquery_api.JobReference()
   566      job_1.jobReference.projectId = 'project1'
   567      job_1.jobReference.jobId = 'jobId1'
   568      job_2 = bigquery_api.Job()
   569      job_2.jobReference = bigquery_api.JobReference()
   570      job_2.jobReference.projectId = 'project1'
   571      job_2.jobReference.jobId = 'jobId2'
   572  
   573      job_1_waiting = mock.Mock()
   574      job_1_waiting.status.state = 'RUNNING'
   575      job_2_done = mock.Mock()
   576      job_2_done.status.state = 'DONE'
   577      job_2_done.status.errorResult = None
   578  
   579      job_1_done = mock.Mock()
   580      job_1_done.status.state = 'DONE'
   581      job_1_done.status.errorResult = None
   582  
   583      bq_client = mock.Mock()
   584      bq_client.jobs.Get.side_effect = [
   585          job_1_waiting, job_2_done, job_1_done, job_2_done
   586      ]
   587      partition_1 = ('project:dataset.table0', ['file0'])
   588      partition_2 = ('project:dataset.table1', ['file1'])
   589      bq_client.jobs.Insert.side_effect = [job_1, job_2]
   590      test_job_prefix = "test_job"
   591  
   592      expected_dest_jobref_list = [(partition_1[0], job_1.jobReference),
   593                                   (partition_2[0], job_2.jobReference)]
   594      with TestPipeline('DirectRunner') as p:
   595        partitions = p | beam.Create([partition_1, partition_2])
   596        outputs = (
   597            partitions
   598            | beam.ParDo(
   599                bqfl.TriggerLoadJobs(test_client=bq_client), test_job_prefix))
   600  
   601        assert_that(outputs, equal_to(expected_dest_jobref_list))
   602  
   603      sleep_mock.assert_called_once()
   604  
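           # Same polling setup, except that on the second check job 1 completes with
           # an errorResult, so the load step raises and the pipeline fails.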
   605    @mock.patch('time.sleep')
   606    def test_one_load_job_failed_after_waiting(self, sleep_mock):
   607      job_1 = bigquery_api.Job()
   608      job_1.jobReference = bigquery_api.JobReference()
   609      job_1.jobReference.projectId = 'project1'
   610      job_1.jobReference.jobId = 'jobId1'
   611      job_2 = bigquery_api.Job()
   612      job_2.jobReference = bigquery_api.JobReference()
   613      job_2.jobReference.projectId = 'project1'
   614      job_2.jobReference.jobId = 'jobId2'
   615  
   616      job_1_waiting = mock.Mock()
   617      job_1_waiting.status.state = 'RUNNING'
   618      job_2_done = mock.Mock()
   619      job_2_done.status.state = 'DONE'
   620      job_2_done.status.errorResult = None
   621  
   622      job_1_error = mock.Mock()
   623      job_1_error.status.state = 'DONE'
   624      job_1_error.status.errorResult = 'Some problems happened'
   625  
   626      bq_client = mock.Mock()
   627      bq_client.jobs.Get.side_effect = [
   628          job_1_waiting, job_2_done, job_1_error, job_2_done
   629      ]
   630      partition_1 = ('project:dataset.table0', ['file0'])
   631      partition_2 = ('project:dataset.table1', ['file1'])
   632      bq_client.jobs.Insert.side_effect = [job_1, job_2]
   633      test_job_prefix = "test_job"
   634  
   635      with self.assertRaises(Exception):
   636        with TestPipeline('DirectRunner') as p:
   637          partitions = p | beam.Create([partition_1, partition_2])
   638          _ = (
   639              partitions
   640              | beam.ParDo(
   641                  bqfl.TriggerLoadJobs(test_client=bq_client), test_job_prefix))
   642  
   643      sleep_mock.assert_called_once()
   644  
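           # With max_file_size=45, max_partition_size=80 and max_files_per_partition=2
           # the eleven input rows end up in six files and six partitions, so six load
           # jobs and six copy jobs are asserted below.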
   645    def test_multiple_partition_files(self):
   646      destination = 'project1:dataset1.table1'
   647  
   648      job_reference = bigquery_api.JobReference()
   649      job_reference.projectId = 'project1'
   650      job_reference.jobId = 'job_name1'
   651      result_job = mock.Mock()
   652      result_job.jobReference = job_reference
   653  
   654      mock_job = mock.Mock()
   655      mock_job.status.state = 'DONE'
   656      mock_job.status.errorResult = None
   657      mock_job.jobReference = job_reference
   658  
   659      bq_client = mock.Mock()
   660      bq_client.jobs.Get.return_value = mock_job
   661  
   662      bq_client.jobs.Insert.return_value = result_job
   663      bq_client.tables.Delete.return_value = None
   664  
   665      with TestPipeline('DirectRunner') as p:
   666        outputs = (
   667            p
   668            | beam.Create(_ELEMENTS, reshuffle=False)
   669            | bqfl.BigQueryBatchFileLoads(
   670                destination,
   671                custom_gcs_temp_location=self._new_tempdir(),
   672                test_client=bq_client,
   673                validate=False,
   674                temp_file_format=bigquery_tools.FileFormat.JSON,
   675                max_file_size=45,
   676                max_partition_size=80,
   677                max_files_per_partition=2))
   678  
   679        dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
   680        dest_load_jobs = outputs[
   681            bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
   682        dest_copy_jobs = outputs[
   683            bqfl.BigQueryBatchFileLoads.DESTINATION_COPY_JOBID_PAIRS]
   684  
   685        load_jobs = dest_load_jobs | "GetLoadJobs" >> beam.Map(lambda x: x[1])
   686        copy_jobs = dest_copy_jobs | "GetCopyJobs" >> beam.Map(lambda x: x[1])
   687  
   688        files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
   689        destinations = (
   690            dest_files
   691            | "GetDests" >> beam.Map(
   692                lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
   693            | "GetUniques" >> combiners.Count.PerKey()
   694            | "GetFinalDests" >> beam.Keys())
   695  
   696        # All files exist
   697        _ = (
   698            files
   699            | beam.Map(lambda x: hamcrest_assert(os.path.exists(x), is_(True))))
   700  
    701        # Six files are expected for the single destination (small max_file_size)
   702        assert_that(
   703            files | "CountFiles" >> combiners.Count.Globally(),
   704            equal_to([6]),
   705            label='CheckFileCount')
   706  
   707        assert_that(
   708            destinations, equal_to([destination]), label='CheckDestinations')
   709  
   710        assert_that(
   711            load_jobs | "CountLoadJobs" >> combiners.Count.Globally(),
   712            equal_to([6]),
   713            label='CheckLoadJobCount')
   714        assert_that(
   715            copy_jobs | "CountCopyJobs" >> combiners.Count.Globally(),
   716            equal_to([6]),
   717            label='CheckCopyJobCount')
   718  
   719    @parameterized.expand([
   720        param(write_disposition=BigQueryDisposition.WRITE_TRUNCATE),
   721        param(write_disposition=BigQueryDisposition.WRITE_EMPTY)
   722    ])
   723    @mock.patch(
   724        'apache_beam.io.gcp.bigquery_file_loads.TriggerCopyJobs.process',
   725        wraps=lambda *x: None)
   726    def test_multiple_partition_files_write_dispositions(
   727        self, mock_call_process, write_disposition):
   728      destination = 'project1:dataset1.table1'
   729  
   730      job_reference = bigquery_api.JobReference()
   731      job_reference.projectId = 'project1'
   732      job_reference.jobId = 'job_name1'
   733      result_job = mock.Mock()
   734      result_job.jobReference = job_reference
   735  
   736      mock_job = mock.Mock()
   737      mock_job.status.state = 'DONE'
   738      mock_job.status.errorResult = None
   739      mock_job.jobReference = job_reference
   740  
   741      bq_client = mock.Mock()
   742      bq_client.jobs.Get.return_value = mock_job
   743  
   744      bq_client.jobs.Insert.return_value = result_job
   745      bq_client.tables.Delete.return_value = None
   746  
   747      with TestPipeline('DirectRunner') as p:
   748        _ = (
   749            p
   750            | beam.Create(_ELEMENTS, reshuffle=False)
   751            | bqfl.BigQueryBatchFileLoads(
   752                destination,
   753                custom_gcs_temp_location=self._new_tempdir(),
   754                test_client=bq_client,
   755                validate=False,
   756                temp_file_format=bigquery_tools.FileFormat.JSON,
   757                max_file_size=45,
   758                max_partition_size=80,
   759                max_files_per_partition=2,
   760                write_disposition=write_disposition))
    761      # TriggerCopyJobs.process is only invoked once
   762      self.assertEqual(mock_call_process.call_count, 1)
   763  
   764    @parameterized.expand([
   765        param(is_streaming=False, with_auto_sharding=False),
   766        param(is_streaming=True, with_auto_sharding=False),
   767        param(is_streaming=True, with_auto_sharding=True),
   768    ])
   769    def test_triggering_frequency(self, is_streaming, with_auto_sharding):
   770      destination = 'project1:dataset1.table1'
   771  
   772      job_reference = bigquery_api.JobReference()
   773      job_reference.projectId = 'project1'
   774      job_reference.jobId = 'job_name1'
   775      result_job = bigquery_api.Job()
   776      result_job.jobReference = job_reference
   777  
   778      mock_job = mock.Mock()
   779      mock_job.status.state = 'DONE'
   780      mock_job.status.errorResult = None
   781      mock_job.jobReference = job_reference
   782  
   783      bq_client = mock.Mock()
   784      bq_client.jobs.Get.return_value = mock_job
   785      bq_client.jobs.Insert.return_value = result_job
   786  
    787      # Insert a fake clock to work with auto-sharding, which needs a
    788      # processing-time timer.
   789      class _FakeClock(object):
   790        def __init__(self, now=time.time()):
   791          self._now = now
   792  
   793        def __call__(self):
   794          return self._now
   795  
   796      start_time = timestamp.Timestamp(0)
   797      bq_client.test_clock = _FakeClock(now=start_time)
   798  
   799      triggering_frequency = 20 if is_streaming else None
   800      transform = bqfl.BigQueryBatchFileLoads(
   801          destination,
   802          custom_gcs_temp_location=self._new_tempdir(),
   803          test_client=bq_client,
   804          validate=False,
   805          temp_file_format=bigquery_tools.FileFormat.JSON,
   806          is_streaming_pipeline=is_streaming,
   807          triggering_frequency=triggering_frequency,
   808          with_auto_sharding=with_auto_sharding)
   809  
    810      # Need to run this on a direct runner to avoid serializing mocks
   811      test_options = PipelineOptions(flags=['--allow_unsafe_triggers'])
   812      test_options.view_as(StandardOptions).streaming = is_streaming
   813      with TestPipeline(runner='BundleBasedDirectRunner',
   814                        options=test_options) as p:
   815        if is_streaming:
   816          _SIZE = len(_ELEMENTS)
    817          first_batch = [
   818              TimestampedValue(value, start_time + i + 1) for i,
   819              value in enumerate(_ELEMENTS[:_SIZE // 2])
   820          ]
   821          second_batch = [
   822              TimestampedValue(value, start_time + _SIZE // 2 + i + 1) for i,
   823              value in enumerate(_ELEMENTS[_SIZE // 2:])
   824          ]
   825          # Advance processing time between batches of input elements to fire the
   826          # user triggers. Intentionally advance the processing time twice for the
   827          # auto-sharding case since we need to first fire the timer and then
   828          # fire the trigger.
   829          test_stream = (
   830              TestStream().advance_watermark_to(start_time).add_elements(
    831                  first_batch).advance_processing_time(
   832                      30).advance_processing_time(30).add_elements(second_batch).
   833              advance_processing_time(30).advance_processing_time(
   834                  30).advance_watermark_to_infinity())
   835          input = p | test_stream
   836        else:
   837          input = p | beam.Create(_ELEMENTS)
   838        outputs = input | transform
   839  
   840        dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
   841        dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
   842  
   843        files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
   844        destinations = (
   845            dest_files
   846            | "GetDests" >> beam.Map(
   847                lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
   848            | "GetUniques" >> combiners.Count.PerKey()
   849            | "GetFinalDests" >> beam.Keys())
   850        jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])
   851  
   852        # Check that all files exist.
   853        _ = (
   854            files
   855            | beam.Map(lambda x: hamcrest_assert(os.path.exists(x), is_(True))))
   856  
    857        # Expect two load jobs to be generated in the streaming case due to the
    858        # triggering frequency. Grouping is per trigger, so we expect two entries
    859        # in the output as opposed to one.
   860        file_count = files | combiners.Count.Globally().without_defaults()
   861        expected_file_count = [1, 1] if is_streaming else [1]
   862        expected_destinations = [destination, destination
   863                                 ] if is_streaming else [destination]
   864        expected_jobs = [job_reference, job_reference
   865                         ] if is_streaming else [job_reference]
   866        assert_that(file_count, equal_to(expected_file_count), label='CountFiles')
   867        assert_that(
   868            destinations,
   869            equal_to(expected_destinations),
   870            label='CheckDestinations')
   871        assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')
   872  
   873  
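         # The integration tests below run against a real BigQuery project: setUp
         # creates a uniquely named dataset, each test writes to tables inside it, and
         # tearDown deletes the dataset together with its contents.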
   874  class BigQueryFileLoadsIT(unittest.TestCase):
   875  
   876    BIG_QUERY_DATASET_ID = 'python_bq_file_loads_'
   877    BIG_QUERY_SCHEMA = (
   878        '{"fields": [{"name": "name","type": "STRING"},'
   879        '{"name": "language","type": "STRING"}]}')
   880  
   881    BIG_QUERY_SCHEMA_2 = (
   882        '{"fields": [{"name": "name","type": "STRING"},'
   883        '{"name": "foundation","type": "STRING"}]}')
   884  
   885    BIG_QUERY_STREAMING_SCHEMA = ({
   886        'fields': [{
   887            'name': 'Integr', 'type': 'INTEGER', 'mode': 'NULLABLE'
   888        }]
   889    })
   890  
   891    def setUp(self):
   892      self.test_pipeline = TestPipeline(is_integration_test=True)
   893      self.runner_name = type(self.test_pipeline.runner).__name__
   894      self.project = self.test_pipeline.get_option('project')
   895  
   896      self.dataset_id = '%s%d%s' % (
   897          self.BIG_QUERY_DATASET_ID, int(time.time()), secrets.token_hex(3))
   898      self.bigquery_client = bigquery_tools.BigQueryWrapper()
   899      self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
   900      self.output_table = "%s.output_table" % (self.dataset_id)
   901      _LOGGER.info(
   902          "Created dataset %s in project %s", self.dataset_id, self.project)
   903  
   904    @pytest.mark.it_postcommit
   905    def test_multiple_destinations_transform(self):
   906      output_table_1 = '%s%s' % (self.output_table, 1)
   907      output_table_2 = '%s%s' % (self.output_table, 2)
   908      output_table_3 = '%s%s' % (self.output_table, 3)
   909      output_table_4 = '%s%s' % (self.output_table, 4)
   910      schema1 = bigquery.WriteToBigQuery.get_dict_table_schema(
   911          bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA))
   912      schema2 = bigquery.WriteToBigQuery.get_dict_table_schema(
   913          bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2))
   914  
   915      schema_kv_pairs = [(output_table_1, schema1), (output_table_2, schema2),
   916                         (output_table_3, schema1), (output_table_4, schema2)]
   917      pipeline_verifiers = [
   918          BigqueryFullResultMatcher(
   919              project=self.project,
   920              query="SELECT name, language FROM %s" % output_table_1,
   921              data=[(d['name'], d['language']) for d in _ELEMENTS
   922                    if 'language' in d]),
   923          BigqueryFullResultMatcher(
   924              project=self.project,
   925              query="SELECT name, foundation FROM %s" % output_table_2,
   926              data=[(d['name'], d['foundation']) for d in _ELEMENTS
   927                    if 'foundation' in d]),
   928          BigqueryFullResultMatcher(
   929              project=self.project,
   930              query="SELECT name, language FROM %s" % output_table_3,
   931              data=[(d['name'], d['language']) for d in _ELEMENTS
   932                    if 'language' in d]),
   933          BigqueryFullResultMatcher(
   934              project=self.project,
   935              query="SELECT name, foundation FROM %s" % output_table_4,
   936              data=[(d['name'], d['foundation']) for d in _ELEMENTS
   937                    if 'foundation' in d])
   938      ]
   939  
   940      args = self.test_pipeline.get_full_options_as_args(
   941          on_success_matcher=all_of(*pipeline_verifiers))
   942  
   943      with beam.Pipeline(argv=args) as p:
   944        input = p | beam.Create(_ELEMENTS, reshuffle=False)
   945  
   946        schema_map_pcv = beam.pvalue.AsDict(
   947            p | "MakeSchemas" >> beam.Create(schema_kv_pairs))
   948  
   949        table_record_pcv = beam.pvalue.AsDict(
   950            p | "MakeTables" >> beam.Create([('table1', output_table_1),
   951                                             ('table2', output_table_2)]))
   952  
    953        # Funnel all input through a single key so it lands on the same worker
   954        input = (
   955            input
   956            | beam.Map(lambda x: (None, x))
   957            | beam.GroupByKey()
   958            | beam.FlatMap(lambda elm: elm[1]))
   959  
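               # The first write resolves table and schema at runtime: the table
               # callable picks output_table_1 or output_table_2 from the
               # table_record_pcv side input depending on whether the row has a
               # 'language' field, and the schema callable looks the chosen destination
               # up in schema_map_pcv.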
   960        _ = (
   961            input | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
   962                table=lambda x,
   963                tables:
   964                (tables['table1'] if 'language' in x else tables['table2']),
   965                table_side_inputs=(table_record_pcv, ),
   966                schema=lambda dest,
   967                schema_map: schema_map.get(dest, None),
   968                schema_side_inputs=(schema_map_pcv, ),
   969                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
   970                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
   971  
   972        _ = (
   973            input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
   974                table=lambda x:
   975                (output_table_3 if 'language' in x else output_table_4),
   976                schema=lambda dest,
   977                schema_map: schema_map.get(dest, None),
   978                schema_side_inputs=(schema_map_pcv, ),
   979                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
   980                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
   981                max_file_size=20,
   982                max_files_per_bundle=-1))
   983  
   984    @pytest.mark.it_postcommit
   985    def test_bqfl_streaming(self):
   986      if isinstance(self.test_pipeline.runner, TestDataflowRunner):
   987        self.skipTest("TestStream is not supported on TestDataflowRunner")
   988      output_table = '%s_%s' % (self.output_table, 'ints')
   989      _SIZE = 100
   990      schema = self.BIG_QUERY_STREAMING_SCHEMA
   991      l = [{'Integr': i} for i in range(_SIZE)]
   992  
   993      bq_matcher = BigqueryFullResultStreamingMatcher(
   994          project=self.project,
   995          query="SELECT Integr FROM %s" % output_table,
   996          data=[(i, ) for i in range(100)])
   997  
   998      args = self.test_pipeline.get_full_options_as_args(
   999          on_success_matcher=bq_matcher,
  1000          streaming=True,
  1001          allow_unsafe_triggers=True)
  1002      with beam.Pipeline(argv=args) as p:
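               # Feed the 100 rows in four batches, advancing processing time by 100s
               # between batches so that the 100-second triggering_frequency can fire
               # between them.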
  1003        stream_source = (
  1004            TestStream().advance_watermark_to(0).advance_processing_time(
  1005                100).add_elements(l[:_SIZE // 4]).
  1006            advance_processing_time(100).advance_watermark_to(100).add_elements(
  1007                l[_SIZE // 4:2 * _SIZE // 4]).advance_processing_time(
  1008                    100).advance_watermark_to(200).add_elements(
  1009                        l[2 * _SIZE // 4:3 * _SIZE // 4]).advance_processing_time(
  1010                            100).advance_watermark_to(300).add_elements(
  1011                                l[3 * _SIZE // 4:]).advance_processing_time(
  1012                                    100).advance_watermark_to_infinity())
  1013        _ = (p
  1014             | stream_source
  1015             | bigquery.WriteToBigQuery(output_table,
  1016                                        schema=schema,
  1017                                        method=bigquery.WriteToBigQuery \
  1018                                          .Method.FILE_LOADS,
  1019                                        triggering_frequency=100))
  1020  
  1021      hamcrest_assert(p, bq_matcher)
  1022  
  1023    @pytest.mark.it_postcommit
  1024    def test_bqfl_streaming_with_copy_jobs(self):
  1025      if isinstance(self.test_pipeline.runner, TestDataflowRunner):
  1026        self.skipTest("TestStream is not supported on TestDataflowRunner")
  1027      output_table = '%s_%s' % (self.output_table, 'with_copy_jobs')
  1028      _SIZE = 100
  1029      schema = self.BIG_QUERY_STREAMING_SCHEMA
  1030      l = [{'Integr': i} for i in range(_SIZE)]
  1031  
  1032      bq_matcher = BigqueryFullResultStreamingMatcher(
  1033          project=self.project,
  1034          query="SELECT Integr FROM %s" % output_table,
  1035          data=[(i, ) for i in range(100)])
  1036  
  1037      args = self.test_pipeline.get_full_options_as_args(
  1038          on_success_matcher=bq_matcher,
  1039          streaming=True,
  1040          allow_unsafe_triggers=True)
  1041  
  1042      # Override these parameters to induce copy jobs
  1043      bqfl._DEFAULT_MAX_FILE_SIZE = 100
  1044      bqfl._MAXIMUM_LOAD_SIZE = 400
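             # With files capped at 100 bytes and a 400-byte load limit, a destination
             # needs several load jobs into temporary tables, which the transform then
             # consolidates into the final table via copy jobs.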
  1045  
  1046      with beam.Pipeline(argv=args) as p:
  1047        stream_source = (
  1048            TestStream().advance_watermark_to(0).advance_processing_time(
  1049                100).add_elements(l[:_SIZE // 4]).
  1050            advance_processing_time(100).advance_watermark_to(100).add_elements(
  1051                l[_SIZE // 4:2 * _SIZE // 4]).advance_processing_time(
  1052                    100).advance_watermark_to(200).add_elements(
  1053                        l[2 * _SIZE // 4:3 * _SIZE // 4]).advance_processing_time(
  1054                            100).advance_watermark_to(300).add_elements(
  1055                                l[3 * _SIZE // 4:]).advance_processing_time(100).
  1056            advance_watermark_to_infinity().advance_processing_time(100))
  1057  
  1058        _ = (p
  1059             | stream_source
  1060             | bigquery.WriteToBigQuery(output_table,
  1061                                        schema=schema,
  1062                                        method=bigquery.WriteToBigQuery \
  1063                                        .Method.FILE_LOADS,
  1064                                        triggering_frequency=100))
  1065  
  1066      hamcrest_assert(p, bq_matcher)
  1067  
  1068    @pytest.mark.it_postcommit
  1069    def test_bqfl_streaming_with_dynamic_destinations(self):
  1070      if isinstance(self.test_pipeline.runner, TestDataflowRunner):
  1071        self.skipTest("TestStream is not supported on TestDataflowRunner")
  1072      even_table = '%s_%s' % (self.output_table, "dynamic_dest_0")
  1073      odd_table = '%s_%s' % (self.output_table, "dynamic_dest_1")
  1074      output_table = lambda row: even_table if (
  1075          row['Integr'] % 2 == 0) else odd_table
  1076      _SIZE = 100
  1077      schema = self.BIG_QUERY_STREAMING_SCHEMA
  1078      l = [{'Integr': i} for i in range(_SIZE)]
  1079  
  1080      pipeline_verifiers = [
  1081          BigqueryFullResultStreamingMatcher(
  1082              project=self.project,
  1083              query="SELECT Integr FROM %s" % even_table,
  1084              data=[(i, ) for i in range(0, 100, 2)]),
  1085          BigqueryFullResultStreamingMatcher(
  1086              project=self.project,
  1087              query="SELECT Integr FROM %s" % odd_table,
  1088              data=[(i, ) for i in range(1, 100, 2)])
  1089      ]
  1090  
  1091      args = self.test_pipeline.get_full_options_as_args(
  1092          on_success_matcher=all_of(*pipeline_verifiers),
  1093          streaming=True,
  1094          allow_unsafe_triggers=True)
  1095  
  1096      with beam.Pipeline(argv=args) as p:
  1097        stream_source = (
  1098            TestStream().advance_watermark_to(0).advance_processing_time(
  1099                100).add_elements(l[:_SIZE // 4]).
  1100            advance_processing_time(100).advance_watermark_to(100).add_elements(
  1101                l[_SIZE // 4:2 * _SIZE // 4]).advance_processing_time(
  1102                    100).advance_watermark_to(200).add_elements(
  1103                        l[2 * _SIZE // 4:3 * _SIZE // 4]).advance_processing_time(
  1104                            100).advance_watermark_to(300).add_elements(
  1105                                l[3 * _SIZE // 4:]).advance_processing_time(100).
  1106            advance_watermark_to_infinity().advance_processing_time(100))
  1107  
  1108        _ = (p
  1109             | stream_source
  1110             | bigquery.WriteToBigQuery(output_table,
  1111                                        schema=schema,
  1112                                        method=bigquery.WriteToBigQuery \
  1113                                        .Method.FILE_LOADS,
  1114                                        triggering_frequency=100))
  1115      hamcrest_assert(p, all_of(*pipeline_verifiers))
  1116  
  1117    @pytest.mark.it_postcommit
  1118    def test_one_job_fails_all_jobs_fail(self):
  1119  
   1120      # If one of the import jobs fails, then the other jobs must not be
   1121      # performed. This avoids reinserting records when a pipeline fails and is
   1122      # rerun.
  1123      output_table_1 = '%s%s' % (self.output_table, 1)
  1124      output_table_2 = '%s%s' % (self.output_table, 2)
  1125  
  1126      self.bigquery_client.get_or_create_table(
  1127          self.project,
  1128          self.dataset_id,
  1129          output_table_1.split('.')[1],
  1130          bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
  1131          None,
  1132          None)
  1133      self.bigquery_client.get_or_create_table(
  1134          self.project,
  1135          self.dataset_id,
  1136          output_table_2.split('.')[1],
  1137          bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
  1138          None,
  1139          None)
  1140  
  1141      pipeline_verifiers = [
  1142          BigqueryFullResultMatcher(
  1143              project=self.project,
  1144              query="SELECT name, language FROM %s" % output_table_1,
  1145              data=[]),
  1146          BigqueryFullResultMatcher(
  1147              project=self.project,
  1148              query="SELECT name, foundation FROM %s" % output_table_2,
  1149              data=[])
  1150      ]
  1151  
  1152      args = self.test_pipeline.get_full_options_as_args()
  1153  
  1154      with self.assertRaises(Exception):
  1155        # The pipeline below fails because neither a schema nor SCHEMA_AUTODETECT
   1156        # is specified.
  1157        with beam.Pipeline(argv=args) as p:
  1158          input = p | beam.Create(_ELEMENTS)
  1159          input2 = p | "Broken record" >> beam.Create(['language_broken_record'])
  1160  
  1161          input = (input, input2) | beam.Flatten()
  1162  
  1163          _ = (
  1164              input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
  1165                  table=lambda x:
  1166                  (output_table_1 if 'language' in x else output_table_2),
  1167                  create_disposition=(
  1168                      beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
  1169                  write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
  1170                  temp_file_format=bigquery_tools.FileFormat.JSON))
  1171  
  1172      hamcrest_assert(p, all_of(*pipeline_verifiers))
  1173  
  1174    def tearDown(self):
  1175      request = bigquery_api.BigqueryDatasetsDeleteRequest(
  1176          projectId=self.project, datasetId=self.dataset_id, deleteContents=True)
  1177      try:
  1178        _LOGGER.info(
  1179            "Deleting dataset %s in project %s", self.dataset_id, self.project)
  1180        self.bigquery_client.client.datasets.Delete(request)
  1181      except HttpError:
  1182        _LOGGER.debug(
  1183            'Failed to clean up dataset %s in project %s',
  1184            self.dataset_id,
  1185            self.project)
  1186  
  1187  
  1188  if __name__ == '__main__':
  1189    logging.getLogger().setLevel(logging.INFO)
  1190    unittest.main()