github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_file_loads.py

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  Functionality to perform file loads into BigQuery for Batch and Streaming
    20  pipelines.
    21  
    22  This source is able to work around BigQuery load quotas and limitations. When
    23  destinations are dynamic, or when data for a single job is too large, the data
    24  will be split into multiple jobs.
    25  
    26  NOTHING IN THIS FILE HAS BACKWARDS COMPATIBILITY GUARANTEES.
    27  """
    28  
    29  # pytype: skip-file
    30  
    31  import hashlib
    32  import io
    33  import logging
    34  import random
    35  import time
    36  import uuid
    37  
    38  import apache_beam as beam
    39  from apache_beam import pvalue
    40  from apache_beam.io import filesystems as fs
    41  from apache_beam.io.gcp import bigquery_tools
    42  from apache_beam.io.gcp.bigquery_io_metadata import create_bigquery_io_metadata
    43  from apache_beam.options import value_provider as vp
    44  from apache_beam.options.pipeline_options import GoogleCloudOptions
    45  from apache_beam.transforms import trigger
    46  from apache_beam.transforms.display import DisplayDataItem
    47  from apache_beam.transforms.util import GroupIntoBatches
    48  from apache_beam.transforms.window import GlobalWindows
    49  
    50  # Protect against environments where bigquery library is not available.
    51  # pylint: disable=wrong-import-order, wrong-import-position
    52  try:
    53    from apitools.base.py.exceptions import HttpError
    54  except ImportError:
    55    pass
    56  
    57  _LOGGER = logging.getLogger(__name__)
    58  
    59  ONE_TERABYTE = (1 << 40)
    60  
    61  # The maximum file size for imports is 5TB. We keep our files under that.
    62  _DEFAULT_MAX_FILE_SIZE = 4 * ONE_TERABYTE
    63  
    64  _DEFAULT_MAX_WRITERS_PER_BUNDLE = 20
    65  
    66  # The maximum total size of source files for a single load job is 15 TB.
    67  _MAXIMUM_LOAD_SIZE = 15 * ONE_TERABYTE
    68  
    69  # BigQuery only supports up to 10,000 source URIs for a single load job.
    70  _MAXIMUM_SOURCE_URIS = 10 * 1000
    71  
    72  # If triggering_frequency is supplied, we will trigger the file write after
    73  # this many records are written.
    74  _FILE_TRIGGERING_RECORD_COUNT = 500000
    75  
    76  # If using auto-sharding for unbounded data, we batch the records before
    77  # triggering file write to avoid generating too many small files.
    78  _FILE_TRIGGERING_BATCHING_DURATION_SECS = 1
    79  
    80  # How many seconds we wait before polling a pending job
    81  _SLEEP_DURATION_BETWEEN_POLLS = 10
    82  
    83  
    84  def _generate_job_name(job_name, job_type, step_name):
    85    return bigquery_tools.generate_bq_job_name(
    86        job_name=job_name,
    87        step_id=step_name,
    88        job_type=job_type,
    89        random=random.randint(0, 1000))
    90  
    91  
    92  def file_prefix_generator(
    93      with_validation=True, pipeline_gcs_location=None, temp_location=None):
    94    def _generate_file_prefix(unused_elm):
    95      # If a gcs location is provided to the pipeline, then we shall use that.
    96      # Otherwise, we shall use the temp_location from pipeline options.
    97      gcs_base = pipeline_gcs_location.get()
    98      if not gcs_base:
    99        gcs_base = temp_location
   100  
   101      # This will fail at pipeline execution time, but will fail early, as this
   102      # step doesn't have any dependencies (and thus will be one of the first
   103      # stages to be run).
   104      if with_validation and (not gcs_base or not gcs_base.startswith('gs://')):
   105        raise ValueError(
   106            'Invalid GCS location: %r.\n'
   107            'Writing to BigQuery with FILE_LOADS method requires a'
   108            ' GCS location to be provided to write files to be loaded'
   109            ' into BigQuery. Please provide a GCS bucket through'
   110            ' custom_gcs_temp_location in the constructor of WriteToBigQuery'
   111            ' or the fallback option --temp_location, or pass'
   112            ' method="STREAMING_INSERTS" to WriteToBigQuery.' % gcs_base)
   113  
   114      prefix_uuid = _bq_uuid()
   115      return fs.FileSystems.join(gcs_base, 'bq_load', prefix_uuid)
   116  
   117    return _generate_file_prefix
   118  
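        # For illustration (a sketch; the bucket name is a placeholder): with
        # pipeline_gcs_location unset and --temp_location=gs://my-bucket/tmp, the
        # generated prefix looks like
        #
        #   gs://my-bucket/tmp/bq_load/<32-character hex uuid>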
   119  
   120  def _make_new_file_writer(
   121      file_prefix,
   122      destination,
   123      file_format,
   124      schema=None,
   125      schema_side_inputs=tuple()):
   126    destination = bigquery_tools.get_hashable_destination(destination)
   127  
   128    # Windows does not allow ':' in filenames, so we replace it with a period.
   129    # Other disallowed characters are:
   130    # https://docs.microsoft.com/en-us/windows/desktop/fileio/naming-a-file
   131    destination = destination.replace(':', '.')
   132  
   133    directory = fs.FileSystems.join(file_prefix, destination)
   134  
   135    if not fs.FileSystems.exists(directory):
   136      fs.FileSystems.mkdirs(directory)
   137  
   138    file_name = str(uuid.uuid4())
   139    file_path = fs.FileSystems.join(file_prefix, destination, file_name)
   140  
   141    if file_format == bigquery_tools.FileFormat.AVRO:
   142      if callable(schema):
   143        schema = schema(destination, *schema_side_inputs)
   144      elif isinstance(schema, vp.ValueProvider):
   145        schema = schema.get()
   146  
   147      writer = bigquery_tools.AvroRowWriter(
   148          fs.FileSystems.create(file_path, "application/avro"), schema)
   149    elif file_format == bigquery_tools.FileFormat.JSON:
   150      writer = bigquery_tools.JsonRowWriter(
   151          fs.FileSystems.create(file_path, "application/text"))
   152    else:
   153      raise ValueError((
   154          'Only AVRO and JSON are supported as intermediate formats for '
   155          'BigQuery WriteRecordsToFile, got: {}.').format(file_format))
   156  
   157    return file_path, writer
   158  
   159  
   160  def _bq_uuid(seed=None):
   161    if not seed:
   162      return str(uuid.uuid4()).replace("-", "")
   163    else:
   164      return str(hashlib.md5(seed.encode('utf8')).hexdigest())
   165  
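        # For example (illustrative values only):
        #
        #   _bq_uuid()                # random hex string, different on every call
        #   _bq_uuid('proj:ds.table') # md5 hex digest of the seed, so the same
        #                             # destination always maps to the same id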
   166  
   167  class _ShardDestinations(beam.DoFn):
   168    """Adds a shard number to the key of the KV element.
   169  
   170    Experimental; no backwards compatibility guarantees."""
   171    DEFAULT_SHARDING_FACTOR = 10
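          # A sketch of the element flow, assuming sharding_factor=3 for example:
          #   ('proj:ds.table', row) -> (('proj:ds.table', 0), row), then shard 1, 2, 0, ...
          # The shard index cycles within a bundle, so rows for a hot destination are
          # spread across several keys ahead of the downstream GroupByKey.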
   172  
   173    def __init__(self, sharding_factor=DEFAULT_SHARDING_FACTOR):
   174      self.sharding_factor = sharding_factor
   175  
   176    def start_bundle(self):
   177      self._shard_count = random.randrange(self.sharding_factor)
   178  
   179    def process(self, element):
   180      destination = element[0]
   181      row = element[1]
   182  
   183      sharded_destination = (
   184          destination, self._shard_count % self.sharding_factor)
   185      self._shard_count += 1
   186      yield (sharded_destination, row)
   187  
   188  
   189  class WriteRecordsToFile(beam.DoFn):
   190    """Write input records to files before triggering a load job.
   191  
   192    This transform keeps up to ``max_files_per_bundle`` files open to write to. It
   193    receives (destination, record) tuples, and it writes the records to different
   194    files for each destination.
   195  
   196    If there are more than ``max_files_per_bundle`` destinations that we need to
   197    write to, then those records are grouped by their destination, and later
   198    written to files by ``WriteGroupedRecordsToFile``.
   199  
   200    It outputs two PCollections.
   201    """
   202  
   203    UNWRITTEN_RECORD_TAG = 'UnwrittenRecords'
   204    WRITTEN_FILE_TAG = 'WrittenFiles'
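          # A sketch of how this DoFn is typically wired (it mirrors _write_files
          # below; the PCollection names are illustrative):
          #
          #   outputs = (
          #       destination_row_pc
          #       | beam.ParDo(WriteRecordsToFile(schema), file_prefix_pcv)
          #       .with_outputs(WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
          #                     WriteRecordsToFile.WRITTEN_FILE_TAG))
          #   written_files = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]
          #   spilled_records = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]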
   205  
   206    def __init__(
   207        self,
   208        schema,
   209        max_files_per_bundle=_DEFAULT_MAX_WRITERS_PER_BUNDLE,
   210        max_file_size=_DEFAULT_MAX_FILE_SIZE,
   211        file_format=None):
   212      """Initialize a :class:`WriteRecordsToFile`.
   213  
   214      Args:
   215        max_files_per_bundle (int): The maximum number of files that can be kept
   216          open during execution of this step in a worker. This limit exists
   217          to avoid overwhelming the worker's memory.
   218        max_file_size (int): The maximum size in bytes for a file to be used in
   219          an export job.
   220  
   221      """
   222      self.schema = schema
   223      self.max_files_per_bundle = max_files_per_bundle
   224      self.max_file_size = max_file_size
   225      self.file_format = file_format or bigquery_tools.FileFormat.JSON
   226  
   227    def display_data(self):
   228      return {
   229          'max_files_per_bundle': self.max_files_per_bundle,
   230          'max_file_size': str(self.max_file_size),
   231          'file_format': self.file_format,
   232      }
   233  
   234    def start_bundle(self):
   235      self._destination_to_file_writer = {}
   236  
   237    def process(self, element, file_prefix, *schema_side_inputs):
   238      """Take a tuple with (destination, row) and write to file or spill out.
   239  
   240      Destination may be a ``TableReference`` or a string, and row is a
   241      Python dictionary for a row to be inserted to BigQuery."""
   242      destination = bigquery_tools.get_hashable_destination(element[0])
   243      row = element[1]
   244  
   245      if destination not in self._destination_to_file_writer:
   246        if len(self._destination_to_file_writer) < self.max_files_per_bundle:
   247          self._destination_to_file_writer[destination] = _make_new_file_writer(
   248              file_prefix,
   249              destination,
   250              self.file_format,
   251              self.schema,
   252              schema_side_inputs)
   253        else:
   254          yield pvalue.TaggedOutput(
   255              WriteRecordsToFile.UNWRITTEN_RECORD_TAG, element)
   256          return
   257  
   258      (file_path, writer) = self._destination_to_file_writer[destination]
   259  
   260      # TODO(pabloem): Is it possible for this to throw exception?
   261      writer.write(row)
   262  
   263      file_size = writer.tell()
   264      if file_size > self.max_file_size:
   265        writer.close()
   266        self._destination_to_file_writer.pop(destination)
   267        yield pvalue.TaggedOutput(
   268            WriteRecordsToFile.WRITTEN_FILE_TAG,
   269            (destination, (file_path, file_size)))
   270  
   271    def finish_bundle(self):
   272      for destination, file_path_writer in \
   273          self._destination_to_file_writer.items():
   274        (file_path, writer) = file_path_writer
   275        file_size = writer.tell()
   276        writer.close()
   277        yield pvalue.TaggedOutput(
   278            WriteRecordsToFile.WRITTEN_FILE_TAG,
   279            GlobalWindows.windowed_value((destination, (file_path, file_size))))
   280      self._destination_to_file_writer = {}
   281  
   282  
   283  class WriteGroupedRecordsToFile(beam.DoFn):
   284    """Receives collection of dest-iterable(records), writes it to files.
   285  
   286    This is different from ``WriteRecordsToFile`` because it receives records
   287    grouped by destination. This means that it's not necessary to keep multiple
   288    file descriptors open, because we know for sure when records for a single
   289    destination have been written out.
   290  
   291    Experimental; no backwards compatibility guarantees.
   292    """
   293    def __init__(
   294        self, schema, max_file_size=_DEFAULT_MAX_FILE_SIZE, file_format=None):
   295      self.schema = schema
   296      self.max_file_size = max_file_size
   297      self.file_format = file_format or bigquery_tools.FileFormat.JSON
   298  
   299    def process(self, element, file_prefix, *schema_side_inputs):
   300      destination = bigquery_tools.get_hashable_destination(element[0])
   301      rows = element[1]
   302  
   303      file_path, writer = None, None
   304  
   305      for row in rows:
   306        if writer is None:
   307          (file_path, writer) = _make_new_file_writer(
   308              file_prefix,
   309              destination,
   310              self.file_format,
   311              self.schema,
   312              schema_side_inputs)
   313  
   314        writer.write(row)
   315  
   316        file_size = writer.tell()
   317        if file_size > self.max_file_size:
   318          writer.close()
   319          yield (destination, (file_path, file_size))
   320          file_path, writer = None, None
   321      if writer is not None:
   322        writer.close()
   323        yield (destination, (file_path, file_size))
   324  
   325  
   326  class UpdateDestinationSchema(beam.DoFn):
   327    """Update destination schema based on data that is about to be copied into it.
   328  
   329    Unlike load and query jobs, BigQuery copy jobs do not support schema field
   330    addition or relaxation on the destination table. This DoFn fills that gap by
   331    updating the destination table schemas to be compatible with the data coming
   332    from the source table so that schema field modification options are respected
   333    regardless of whether data is loaded directly to the destination table or
   334    loaded into temporary tables before being copied into the destination.
   335  
   336    This transform takes as input a (destination, job_reference) pair where the
   337    job_reference refers to a completed load job into a temporary table.
   338  
   339    This transform emits (destination, job_reference) pairs where the
   340    job_reference refers to a submitted load job for performing the schema
   341    modification in JSON format. Note that the input and output job references
   342    are not the same.
   343  
   344    Experimental; no backwards compatibility guarantees.
   345    """
   346    def __init__(
   347        self,
   348        project=None,
   349        write_disposition=None,
   350        test_client=None,
   351        additional_bq_parameters=None,
   352        step_name=None,
   353        load_job_project_id=None):
   354      self.project = project
   355      self._test_client = test_client
   356      self._write_disposition = write_disposition
   357      self._additional_bq_parameters = additional_bq_parameters or {}
   358      self._step_name = step_name
   359      self._load_job_project_id = load_job_project_id
   360  
   361    def start_bundle(self):
   362      self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self._test_client)
   363      self._bq_io_metadata = create_bigquery_io_metadata(self._step_name)
   364      self.pending_jobs = []
   365  
   366    def display_data(self):
   367      return {
   368          'write_disposition': str(self._write_disposition),
   369          'additional_bq_params': str(self._additional_bq_parameters),
   370      }
   371  
   372    def process(self, element, schema_mod_job_name_prefix):
   373      destination = element[0]
   374      temp_table_load_job_reference = element[1]
   375  
   376      if callable(self._additional_bq_parameters):
   377        additional_parameters = self._additional_bq_parameters(destination)
   378      elif isinstance(self._additional_bq_parameters, vp.ValueProvider):
   379        additional_parameters = self._additional_bq_parameters.get()
   380      else:
   381        additional_parameters = self._additional_bq_parameters
   382  
   383      # When writing to normal tables WRITE_TRUNCATE will overwrite the schema but
   384      # when writing to a partition, care needs to be taken to update the schema
   385      # even on WRITE_TRUNCATE.
   386      if (self._write_disposition not in ('WRITE_TRUNCATE', 'WRITE_APPEND') or
   387          not additional_parameters or
   388          not additional_parameters.get("schemaUpdateOptions")):
   389        # No need to modify schema of destination table
   390        return
   391  
   392      table_reference = bigquery_tools.parse_table_reference(destination)
   393      if table_reference.projectId is None:
   394        table_reference.projectId = vp.RuntimeValueProvider.get_value(
   395            'project', str, '') or self.project
   396  
   397      try:
   398        # Check if destination table exists
   399        destination_table = self.bq_wrapper.get_table(
   400            project_id=table_reference.projectId,
   401            dataset_id=table_reference.datasetId,
   402            table_id=table_reference.tableId)
   403      except HttpError as exn:
   404        if exn.status_code == 404:
   405          # Destination table does not exist, so no need to modify its schema
   406          # ahead of the copy jobs.
   407          return
   408        else:
   409          raise
   410  
   411      temp_table_load_job = self.bq_wrapper.get_job(
   412          project=temp_table_load_job_reference.projectId,
   413          job_id=temp_table_load_job_reference.jobId,
   414          location=temp_table_load_job_reference.location)
   415      temp_table_schema = temp_table_load_job.configuration.load.schema
   416  
   417      if bigquery_tools.check_schema_equal(temp_table_schema,
   418                                           destination_table.schema,
   419                                           ignore_descriptions=True,
   420                                           ignore_field_order=True):
   421        # Destination table schema is already the same as the temp table schema,
   422        # so no need to run a job to update the destination table schema.
   423        return
   424  
   425      destination_hash = _bq_uuid(
   426          '%s:%s.%s' % (
   427              table_reference.projectId,
   428              table_reference.datasetId,
   429              table_reference.tableId))
   430      uid = _bq_uuid()
   431      job_name = '%s_%s_%s' % (schema_mod_job_name_prefix, destination_hash, uid)
   432  
   433      _LOGGER.info(
   434          'Triggering schema modification job %s on %s',
   435          job_name,
   436          table_reference)
   437      # Trigger potential schema modification by loading zero rows into the
   438      # destination table with the temporary table schema.
   439      schema_update_job_reference = self.bq_wrapper.perform_load_job(
   440          destination=table_reference,
   441          source_stream=io.BytesIO(),  # file with zero rows
   442          job_id=job_name,
   443          schema=temp_table_schema,
   444          write_disposition='WRITE_APPEND',
   445          create_disposition='CREATE_NEVER',
   446          additional_load_parameters=additional_parameters,
   447          job_labels=self._bq_io_metadata.add_additional_bq_job_labels(),
   448          # JSON format is hardcoded because it permits loading zero rows
   449          # (unlike AVRO) and using a nested schema (unlike CSV).
   450          source_format="NEWLINE_DELIMITED_JSON",
   451          load_job_project_id=self._load_job_project_id)
   452      self.pending_jobs.append(
   453          GlobalWindows.windowed_value(
   454              (destination, schema_update_job_reference)))
   455  
   456    def finish_bundle(self):
   457      # Unlike the other steps, schema update is not always necessary.
   458      # In that case, return a None value to avoid blocking in streaming context.
   459      # Otherwise, the streaming pipeline would get stuck waiting for the
   460      # TriggerCopyJobs side-input.
   461      if not self.pending_jobs:
   462        return [GlobalWindows.windowed_value(None)]
   463  
   464      for windowed_value in self.pending_jobs:
   465        job_ref = windowed_value.value[1]
   466        self.bq_wrapper.wait_for_bq_job(
   467            job_ref, sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS)
   468      return self.pending_jobs
   469  
   470  
   471  class TriggerCopyJobs(beam.DoFn):
   472    """Launches jobs to copy from temporary tables into the main target table.
   473  
   474    When a job needs to write to multiple destination tables, or when a single
   475    destination table needs to have multiple load jobs to write to it, files are
   476    loaded into temporary tables, and those tables are later copied to the
   477    destination tables.
   478  
   479    This transform emits (destination, job_reference) pairs.
   480  
   481    TODO(BEAM-7822): In the file loads method of writing to BigQuery,
   482      copying from temp_tables to destination_table is not atomic.
   483      See: https://issues.apache.org/jira/browse/BEAM-7822
   484    """
   485  
   486    TRIGGER_DELETE_TEMP_TABLES = 'TriggerDeleteTempTables'
   487  
   488    def __init__(
   489        self,
   490        project=None,
   491        create_disposition=None,
   492        write_disposition=None,
   493        test_client=None,
   494        step_name=None,
   495        load_job_project_id=None):
   496      self.project = project
   497      self.create_disposition = create_disposition
   498      self.write_disposition = write_disposition
   499      self.test_client = test_client
   500      self._observed_tables = set()
   501      self.bq_io_metadata = None
   502      self._step_name = step_name
   503      self.load_job_project_id = load_job_project_id
   504  
   505    def display_data(self):
   506      return {
   507          'launchesBigQueryJobs': DisplayDataItem(
   508              True, label="This Dataflow job launches bigquery jobs.")
   509      }
   510  
   511    def setup(self):
   512      self._observed_tables = set()
   513  
   514    def start_bundle(self):
   515      self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client)
   516      if not self.bq_io_metadata:
   517        self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
   518      self.pending_jobs = []
   519  
   520    def process(
   521        self, element_list, job_name_prefix=None, unused_schema_mod_jobs=None):
   522      if isinstance(element_list, tuple):
   523        # Allow this for streaming update compatibility while fixing BEAM-24535.
   524        self.process_one(element_list, job_name_prefix)
   525      else:
   526        for element in element_list:
   527          self.process_one(element, job_name_prefix)
   528  
   529    def process_one(self, element, job_name_prefix):
   530      destination, job_reference = element
   531  
   532      copy_to_reference = bigquery_tools.parse_table_reference(destination)
   533      if copy_to_reference.projectId is None:
   534        copy_to_reference.projectId = vp.RuntimeValueProvider.get_value(
   535            'project', str, '') or self.project
   536  
   537      copy_from_reference = bigquery_tools.parse_table_reference(destination)
   538      copy_from_reference.tableId = job_reference.jobId
   539      if copy_from_reference.projectId is None:
   540        copy_from_reference.projectId = vp.RuntimeValueProvider.get_value(
   541            'project', str, '') or self.project
   542  
   543      copy_job_name = '%s_%s' % (
   544          job_name_prefix,
   545          _bq_uuid(
   546              '%s:%s.%s' % (
   547                  copy_from_reference.projectId,
   548                  copy_from_reference.datasetId,
   549                  copy_from_reference.tableId)))
   550  
   551      _LOGGER.info(
   552          "Triggering copy job from %s to %s",
   553          copy_from_reference,
   554          copy_to_reference)
   555      if copy_to_reference.tableId not in self._observed_tables:
   556        # When the write_disposition for a job is WRITE_TRUNCATE,
   557        # multiple copy jobs to the same destination can stomp on
   558        # each other, truncate data, and write to the BQ table over and
   559        # over.
   560        # Thus, the first copy job runs with the user's write_disposition,
   561        # but afterwards, all jobs must always WRITE_APPEND to the table.
   562        # If they do not, subsequent copy jobs will clear out data appended
   563        # by previous jobs.
   564        write_disposition = self.write_disposition
   565        wait_for_job = True
   566        self._observed_tables.add(copy_to_reference.tableId)
   567      else:
   568        wait_for_job = False
   569        write_disposition = 'WRITE_APPEND'
   570  
   571      if not self.bq_io_metadata:
   572        self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
   573  
   574      project_id = (
   575          copy_to_reference.projectId
   576          if self.load_job_project_id is None else self.load_job_project_id)
   577      job_reference = self.bq_wrapper._insert_copy_job(
   578          project_id,
   579          copy_job_name,
   580          copy_from_reference,
   581          copy_to_reference,
   582          create_disposition=self.create_disposition,
   583          write_disposition=write_disposition,
   584          job_labels=self.bq_io_metadata.add_additional_bq_job_labels())
   585  
   586      if wait_for_job:
   587        self.bq_wrapper.wait_for_bq_job(job_reference, sleep_duration_sec=10)
   588      self.pending_jobs.append(
   589          GlobalWindows.windowed_value((destination, job_reference)))
   590  
   591    def finish_bundle(self):
   592      for windowed_value in self.pending_jobs:
   593        job_ref = windowed_value.value[1]
   594        self.bq_wrapper.wait_for_bq_job(
   595            job_ref, sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS)
   596        yield windowed_value
   597  
   598      yield pvalue.TaggedOutput(
   599          TriggerCopyJobs.TRIGGER_DELETE_TEMP_TABLES,
   600          GlobalWindows.windowed_value(None))
   601  
   602  
   603  class TriggerLoadJobs(beam.DoFn):
   604    """Triggers the import jobs to BQ.
   605  
   606    Experimental; no backwards compatibility guarantees.
   607    """
   608  
   609    TEMP_TABLES = 'TemporaryTables'
   610    ONGOING_JOBS = 'OngoingJobs'
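          # A sketch of the element flow (values are illustrative):
          #   input:               ('proj:ds.table', ['gs://.../file1', 'gs://.../file2'])
          #   ONGOING_JOBS output: ('proj:ds.table', <JobReference of the load job>)
          #   TEMP_TABLES output:  'proj:ds.<job_name>' (the temporary table named
          #                        after the load job) when temporary_tables=True
          #   main output:         the same (destination, job_reference) pairs, emitted
          #                        from finish_bundle once each load job completes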
   611  
   612    def __init__(
   613        self,
   614        schema=None,
   615        project=None,
   616        create_disposition=None,
   617        write_disposition=None,
   618        test_client=None,
   619        temporary_tables=False,
   620        additional_bq_parameters=None,
   621        source_format=None,
   622        step_name=None,
   623        load_job_project_id=None):
   624      self.schema = schema
   625      self.project = project
   626      self.test_client = test_client
   627      self.temporary_tables = temporary_tables
   628      self.additional_bq_parameters = additional_bq_parameters or {}
   629      self.source_format = source_format
   630      self.bq_io_metadata = None
   631      self._step_name = step_name
   632      self.load_job_project_id = load_job_project_id
   633      if self.temporary_tables:
   634        # If we are loading into temporary tables, we rely on the default create
   635        # and write dispositions, which mean that a new table will be created.
   636        self.create_disposition = None
   637        self.write_disposition = None
   638      else:
   639        self.create_disposition = create_disposition
   640        self.write_disposition = write_disposition
   641  
   642    def display_data(self):
   643      result = {
   644          'create_disposition': str(self.create_disposition),
   645          'write_disposition': str(self.write_disposition),
   646          'additional_bq_params': str(self.additional_bq_parameters),
   647          'schema': str(self.schema),
   648          'launchesBigQueryJobs': DisplayDataItem(
   649              True, label="This Dataflow job launches bigquery jobs."),
   650          'source_format': str(self.source_format),
   651      }
   652      return result
   653  
   654    def start_bundle(self):
   655      self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client)
   656      if not self.bq_io_metadata:
   657        self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
   658      self.pending_jobs = []
   659  
   660    def process(self, element, load_job_name_prefix, *schema_side_inputs):
   661      # Each load job is assumed to have files respecting these constraints:
   662      # 1. Total size of all files < 15 TB (Max size for load jobs)
   663      # 2. Total no. of files in a single load job < 10,000
   664      # This assumption means that there will always be a single load job
   665      # triggered for each partition of files.
   666      destination = element[0]
   667      files = element[1]
   668  
   669      if callable(self.schema):
   670        schema = self.schema(destination, *schema_side_inputs)
   671      elif isinstance(self.schema, vp.ValueProvider):
   672        schema = self.schema.get()
   673      else:
   674        schema = self.schema
   675  
   676      if callable(self.additional_bq_parameters):
   677        additional_parameters = self.additional_bq_parameters(destination)
   678      elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
   679        additional_parameters = self.additional_bq_parameters.get()
   680      else:
   681        additional_parameters = self.additional_bq_parameters
   682  
   683      table_reference = bigquery_tools.parse_table_reference(destination)
   684      if table_reference.projectId is None:
   685        table_reference.projectId = vp.RuntimeValueProvider.get_value(
   686            'project', str, '') or self.project
   687      # Load jobs for a single destination are always triggered from the same
   688      # worker. This means that we can generate a deterministic numbered job id,
   689      # and not need to worry about conflicting job ids from other workers.
   690      destination_hash = _bq_uuid(
   691          '%s:%s.%s' % (
   692              table_reference.projectId,
   693              table_reference.datasetId,
   694              table_reference.tableId))
   695      uid = _bq_uuid()
   696      job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid)
   697      _LOGGER.info('Load job has %s files. Job name is %s.', len(files), job_name)
   698  
   699      create_disposition = self.create_disposition
   700      if self.temporary_tables:
   701        # If we are using temporary tables, then we must always create the
   702        # temporary tables, so we replace the create_disposition.
   703        create_disposition = 'CREATE_IF_NEEDED'
   704        # For temporary tables, we create a new table named after the job id.
   705        table_reference.tableId = job_name
   706        yield pvalue.TaggedOutput(
   707            TriggerLoadJobs.TEMP_TABLES,
   708            bigquery_tools.get_hashable_destination(table_reference))
   709  
   710      _LOGGER.info(
   711          'Triggering job %s to load data to BigQuery table %s.'
   712          ' Schema: %s. Additional parameters: %s. Source format: %s',
   713          job_name,
   714          table_reference,
   715          schema,
   716          additional_parameters,
   717          self.source_format,
   718      )
   719      if not self.bq_io_metadata:
   720        self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
   721      job_reference = self.bq_wrapper.perform_load_job(
   722          destination=table_reference,
   723          source_uris=files,
   724          job_id=job_name,
   725          schema=schema,
   726          write_disposition=self.write_disposition,
   727          create_disposition=create_disposition,
   728          additional_load_parameters=additional_parameters,
   729          source_format=self.source_format,
   730          job_labels=self.bq_io_metadata.add_additional_bq_job_labels(),
   731          load_job_project_id=self.load_job_project_id)
   732      yield pvalue.TaggedOutput(
   733          TriggerLoadJobs.ONGOING_JOBS, (destination, job_reference))
   734      self.pending_jobs.append(
   735          GlobalWindows.windowed_value((destination, job_reference)))
   736  
   737    def finish_bundle(self):
   738      for windowed_value in self.pending_jobs:
   739        job_ref = windowed_value.value[1]
   740        self.bq_wrapper.wait_for_bq_job(
   741            job_ref, sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS)
   742      return self.pending_jobs
   743  
   744  
   745  class PartitionFiles(beam.DoFn):
   746  
   747    MULTIPLE_PARTITIONS_TAG = 'MULTIPLE_PARTITIONS'
   748    SINGLE_PARTITION_TAG = 'SINGLE_PARTITION'
   749  
   750    class Partition(object):
   751      def __init__(self, max_size, max_files, files=None, size=0):
   752        self.max_size = max_size
   753        self.max_files = max_files
   754        self.files = files if files is not None else []
   755        self.size = size
   756  
   757      def can_accept(self, file_size, no_of_files=1):
   758        if (((self.size + file_size) <= self.max_size) and
   759            ((len(self.files) + no_of_files) <= self.max_files)):
   760          return True
   761        else:
   762          return False
   763  
   764      def add(self, file_path, file_size):
   765        self.files.append(file_path)
   766        self.size += file_size
   767  
   768    def __init__(self, max_partition_size, max_files_per_partition):
   769      self.max_partition_size = max_partition_size
   770      self.max_files_per_partition = max_files_per_partition
   771  
   772    def process(self, element):
   773      destination = element[0]
   774      files = element[1]
   775      partitions = []
   776  
   777      if not files:
   778        _LOGGER.warning(
   779            'Ignoring a BigQuery batch load partition to %s '
   780            'that contains no source URIs.',
   781            destination)
   782        return
   783  
   784      latest_partition = PartitionFiles.Partition(
   785          self.max_partition_size, self.max_files_per_partition)
   786  
   787      for file_path, file_size in files:
   788        if latest_partition.can_accept(file_size):
   789          latest_partition.add(file_path, file_size)
   790        else:
   791          partitions.append(latest_partition.files)
   792          latest_partition = PartitionFiles.Partition(
   793              self.max_partition_size, self.max_files_per_partition)
   794          latest_partition.add(file_path, file_size)
   795      partitions.append(latest_partition.files)
   796  
   797      if len(partitions) > 1:
   798        output_tag = PartitionFiles.MULTIPLE_PARTITIONS_TAG
   799      else:
   800        output_tag = PartitionFiles.SINGLE_PARTITION_TAG
   801  
   802      for partition in partitions:
   803        yield pvalue.TaggedOutput(output_tag, (destination, partition))
   804  
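        # Worked example of the partitioning above (sizes are made up): with
        # max_partition_size=15 TB and max_files_per_partition=10,000, a destination
        # with 12,000 files of 1 GB each is split into two partitions of 10,000 and
        # 2,000 files, is tagged MULTIPLE_PARTITIONS_TAG, and is therefore loaded
        # through temporary tables rather than directly into the destination table.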
   805  
   806  class DeleteTablesFn(beam.DoFn):
   807    def __init__(self, test_client=None):
   808      self.test_client = test_client
   809  
   810    def start_bundle(self):
   811      self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client)
   812  
   813    def process(self, table_reference):
   814      _LOGGER.info("Deleting table %s", table_reference)
   815      table_reference = bigquery_tools.parse_table_reference(table_reference)
   816      self.bq_wrapper._delete_table(
   817          table_reference.projectId,
   818          table_reference.datasetId,
   819          table_reference.tableId)
   820  
   821  
   822  class BigQueryBatchFileLoads(beam.PTransform):
   823    """Takes in a set of elements, and inserts them to BigQuery via batch loads.
   824  
   825    """
   826  
   827    DESTINATION_JOBID_PAIRS = 'destination_load_jobid_pairs'
   828    DESTINATION_FILE_PAIRS = 'destination_file_pairs'
   829    DESTINATION_COPY_JOBID_PAIRS = 'destination_copy_jobid_pairs'
   830    COUNT = 0
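          # A sketch of how the output dictionary is consumed (table, schema and
          # bucket values are placeholders):
          #
          #   result = rows_pc | BigQueryBatchFileLoads(
          #       destination='my-project:my_dataset.my_table',
          #       schema='name:STRING,value:INTEGER',
          #       custom_gcs_temp_location='gs://my-bucket/tmp')
          #   load_jobs = result[BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
          #   files = result[BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
          #   copy_jobs = result[BigQueryBatchFileLoads.DESTINATION_COPY_JOBID_PAIRS]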
   831  
   832    def __init__(
   833        self,
   834        destination,
   835        project=None,
   836        schema=None,
   837        custom_gcs_temp_location=None,
   838        create_disposition=None,
   839        write_disposition=None,
   840        triggering_frequency=None,
   841        with_auto_sharding=False,
   842        temp_file_format=None,
   843        max_file_size=None,
   844        max_files_per_bundle=None,
   845        max_partition_size=None,
   846        max_files_per_partition=None,
   847        additional_bq_parameters=None,
   848        table_side_inputs=None,
   849        schema_side_inputs=None,
   850        test_client=None,
   851        validate=True,
   852        is_streaming_pipeline=False,
   853        load_job_project_id=None):
   854      self.destination = destination
   855      self.project = project
   856      self.create_disposition = create_disposition
   857      self.write_disposition = write_disposition
   858      self.triggering_frequency = triggering_frequency
   859      self.with_auto_sharding = with_auto_sharding
   860      self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
   861      self.max_files_per_bundle = (
   862          max_files_per_bundle or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
   863      self.max_partition_size = max_partition_size or _MAXIMUM_LOAD_SIZE
   864      self.max_files_per_partition = (
   865          max_files_per_partition or _MAXIMUM_SOURCE_URIS)
   866      if (isinstance(custom_gcs_temp_location, str) or
   867          custom_gcs_temp_location is None):
   868        self._custom_gcs_temp_location = vp.StaticValueProvider(
   869            str, custom_gcs_temp_location or '')
   870      elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
   871        self._custom_gcs_temp_location = custom_gcs_temp_location
   872      else:
   873        raise ValueError('custom_gcs_temp_location must be str or ValueProvider')
   874  
   875      self.test_client = test_client
   876      self.schema = schema
   877      self._temp_file_format = temp_file_format or bigquery_tools.FileFormat.JSON
   878  
   879      # If we have multiple destinations, then we will have multiple load jobs,
   880      # thus we will need temporary tables for atomicity.
   881      self.dynamic_destinations = bool(callable(destination))
   882  
   883      self.additional_bq_parameters = additional_bq_parameters or {}
   884      self.table_side_inputs = table_side_inputs or ()
   885      self.schema_side_inputs = schema_side_inputs or ()
   886  
   887      self.is_streaming_pipeline = is_streaming_pipeline
   888      self.load_job_project_id = load_job_project_id
   889      self._validate = validate
   890      if self._validate:
   891        self.verify()
   892  
   893    def verify(self):
   894      if (isinstance(self._custom_gcs_temp_location.get(), vp.StaticValueProvider)
   895          and not self._custom_gcs_temp_location.get().startswith('gs://')):
   896        # Only fail if the custom location is provided, and it is not a GCS
   897        # location.
   898        raise ValueError(
   899            'Invalid GCS location: %r.\n'
   900            'Writing to BigQuery with FILE_LOADS method requires a '
   901            'GCS location to be provided to write files to be '
   902            'loaded into BigQuery. Please provide a GCS bucket, or '
   903            'pass method="STREAMING_INSERTS" to WriteToBigQuery.' %
   904            self._custom_gcs_temp_location.get())
   905      if self.is_streaming_pipeline and not self.triggering_frequency:
   906        raise ValueError(
   907            'triggering_frequency must be specified to use file'
   908            ' loads in streaming')
   909      elif not self.is_streaming_pipeline and self.triggering_frequency:
   910        raise ValueError(
   911            'triggering_frequency can only be used with file'
   912            ' loads in streaming')
   913      if not self.is_streaming_pipeline and self.with_auto_sharding:
   914        raise ValueError(
   915            'with_auto_sharding can only be used with file loads in streaming.')
   916  
   917    def _window_fn(self):
   918      """Set the correct WindowInto PTransform"""
   919  
   920      # The user-supplied triggering_frequency is often chosen to control how
   921      # many BigQuery load jobs are triggered, to prevent going over BigQuery's
   922      # daily quota for load jobs. If this is set to a large value, currently we
   923      # have to buffer all the data until the trigger fires. Instead we ensure
   924      # that the files are written if a threshold number of records are ready.
   925      # We use only the user-supplied trigger on the actual BigQuery load.
   926      # This allows us to offload the data to the filesystem.
   927      #
   928      # In the case of auto-sharding, however, we use a default trigger, since
   929      # the transform that performs the sharding also batches elements to avoid
   930      # generating too many tiny files. The user trigger is applied right after
   931      # the writes in order to limit the number of load jobs.
   932      if self.is_streaming_pipeline and not self.with_auto_sharding:
   933        return beam.WindowInto(beam.window.GlobalWindows(),
   934                               trigger=trigger.Repeatedly(
   935                                   trigger.AfterAny(
   936                                       trigger.AfterProcessingTime(
   937                                           self.triggering_frequency),
   938                                       trigger.AfterCount(
   939                                           _FILE_TRIGGERING_RECORD_COUNT))),
   940                               accumulation_mode=trigger.AccumulationMode\
   941                                   .DISCARDING)
   942      else:
   943        return beam.WindowInto(beam.window.GlobalWindows())
   944  
   945    def _maybe_apply_user_trigger(self, destination_file_kv_pc):
   946      if self.is_streaming_pipeline:
   947        # Apply the user's trigger back before we start triggering load jobs
   948        return (
   949            destination_file_kv_pc
   950            | "ApplyUserTrigger" >> beam.WindowInto(
   951                beam.window.GlobalWindows(),
   952                trigger=trigger.Repeatedly(
   953                    trigger.AfterAll(
   954                        trigger.AfterProcessingTime(self.triggering_frequency),
   955                        trigger.AfterCount(1))),
   956                accumulation_mode=trigger.AccumulationMode.DISCARDING))
   957      else:
   958        return destination_file_kv_pc
   959  
   960    def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
   961      outputs = (
   962          destination_data_kv_pc
   963          | beam.ParDo(
   964              WriteRecordsToFile(
   965                  schema=self.schema,
   966                  max_files_per_bundle=self.max_files_per_bundle,
   967                  max_file_size=self.max_file_size,
   968                  file_format=self._temp_file_format),
   969              file_prefix_pcv,
   970              *self.schema_side_inputs).with_outputs(
   971                  WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
   972                  WriteRecordsToFile.WRITTEN_FILE_TAG))
   973  
   974      # A PCollection of (destination, file) tuples. It lists files with records,
   975      # and the destination each file is meant to be imported into.
   976      destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]
   977  
   978      # A PCollection of (destination, record) tuples. These are later sharded,
   979      # grouped, and all records for each destination-shard are written to files.
   980      # This PCollection is necessary because not all records can be written into
   981      # files in ``WriteRecordsToFile``.
   982      unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]
   983  
   984      more_destination_files_kv_pc = (
   985          unwritten_records_pc
   986          | beam.ParDo(_ShardDestinations())
   987          | "GroupShardedRows" >> beam.GroupByKey()
   988          | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
   989          | "WriteGroupedRecordsToFile" >> beam.ParDo(
   990              WriteGroupedRecordsToFile(
   991                  schema=self.schema, file_format=self._temp_file_format),
   992              file_prefix_pcv,
   993              *self.schema_side_inputs))
   994  
   995      # TODO(https://github.com/apache/beam/issues/20285): Remove the identity
   996      # transform. We flatten both PCollection paths and use an identity function
   997      # to work around a flatten optimization issue where the wrong coder is
   998      # being used.
   999      all_destination_file_pairs_pc = (
  1000          (destination_files_kv_pc, more_destination_files_kv_pc)
  1001          | "DestinationFilesUnion" >> beam.Flatten()
  1002          | "IdentityWorkaround" >> beam.Map(lambda x: x))
  1003      return self._maybe_apply_user_trigger(all_destination_file_pairs_pc)
  1004  
  1005    def _write_files_with_auto_sharding(
  1006        self, destination_data_kv_pc, file_prefix_pcv):
  1007      clock = self.test_client.test_clock if self.test_client else time.time
  1008  
  1009      # Auto-sharding is achieved via the GroupIntoBatches.WithShardedKey
  1010      # transform, which shards, groups, and at the same time batches the table
  1011      # rows to be inserted into BigQuery.
  1012  
  1013      # Firstly, the keys of tagged_data (table references) are converted to a
  1014      # hashable format. This is needed to work with the keyed states used by
  1015      # GroupIntoBatches. After grouping and batching is done, table references
  1016      # are restored.
  1017      destination_files_kv_pc = (
  1018          destination_data_kv_pc
  1019          |
  1020          'ToHashableTableRef' >> beam.Map(bigquery_tools.to_hashable_table_ref)
  1021          | 'WithAutoSharding' >> GroupIntoBatches.WithShardedKey(
  1022              batch_size=_FILE_TRIGGERING_RECORD_COUNT,
  1023              max_buffering_duration_secs=_FILE_TRIGGERING_BATCHING_DURATION_SECS,
  1024              clock=clock)
  1025          | 'FromHashableTableRefAndDropShard' >> beam.Map(
  1026              lambda kvs:
  1027              (bigquery_tools.parse_table_reference(kvs[0].key), kvs[1]))
  1028          | beam.ParDo(
  1029              WriteGroupedRecordsToFile(
  1030                  schema=self.schema, file_format=self._temp_file_format),
  1031              file_prefix_pcv,
  1032              *self.schema_side_inputs))
  1033  
  1034      return self._maybe_apply_user_trigger(destination_files_kv_pc)
  1035  
  1036    def _load_data(
  1037        self,
  1038        partitions_using_temp_tables,
  1039        partitions_direct_to_destination,
  1040        load_job_name_pcv,
  1041        schema_mod_job_name_pcv,
  1042        copy_job_name_pcv,
  1043        p,
  1044        step_name):
  1045      """Load data to BigQuery
  1046  
  1047      Data is loaded into BigQuery in the following two ways:
  1048        1. Single partition:
  1049           When there is a single partition of files destined to a single
  1050           destination, a single load job is triggered.
  1051        2. Multiple partitions and/or Dynamic Destinations:
  1052           When there are multiple partitions of files destined for a single
  1053           destination or when Dynamic Destinations are used, multiple load jobs
  1054           need to be triggered for each partition/destination. Load Jobs are
  1055           triggered to temporary tables, and those are later copied to the
  1056           appropriate destination table. This preserves atomicity when only
  1057           some of the load jobs fail but not others: if any of them fails, the
  1058           copy jobs are not triggered.
  1059      """
  1060      # Load data using temp tables
  1061      trigger_loads_outputs = (
  1062          partitions_using_temp_tables
  1063          | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
  1064              TriggerLoadJobs(
  1065                  schema=self.schema,
  1066                  project=self.project,
  1067                  write_disposition=self.write_disposition,
  1068                  create_disposition=self.create_disposition,
  1069                  test_client=self.test_client,
  1070                  temporary_tables=True,
  1071                  additional_bq_parameters=self.additional_bq_parameters,
  1072                  source_format=self._temp_file_format,
  1073                  step_name=step_name,
  1074                  load_job_project_id=self.load_job_project_id),
  1075              load_job_name_pcv,
  1076              *self.schema_side_inputs).with_outputs(
  1077                  TriggerLoadJobs.TEMP_TABLES,
  1078                  TriggerLoadJobs.ONGOING_JOBS,
  1079                  main='main'))
  1080  
  1081      finished_temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
  1082      temp_tables_load_job_ids_pc = trigger_loads_outputs[
  1083          TriggerLoadJobs.ONGOING_JOBS]
  1084      temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]
  1085  
  1086      schema_mod_job_ids_pc = (
  1087          finished_temp_tables_load_job_ids_pc
  1088          | beam.ParDo(
  1089              UpdateDestinationSchema(
  1090                  project=self.project,
  1091                  write_disposition=self.write_disposition,
  1092                  test_client=self.test_client,
  1093                  additional_bq_parameters=self.additional_bq_parameters,
  1094                  step_name=step_name,
  1095                  load_job_project_id=self.load_job_project_id),
  1096              schema_mod_job_name_pcv))
  1097  
  1098      if self.write_disposition in ('WRITE_EMPTY', 'WRITE_TRUNCATE'):
  1099        # All loads going to the same table must be processed together so that
  1100        # the truncation happens only once. See
  1101        # https://github.com/apache/beam/issues/24535.
  1102        finished_temp_tables_load_job_ids_list_pc = (
  1103            finished_temp_tables_load_job_ids_pc | beam.MapTuple(
  1104                lambda destination,
  1105                job_reference: (
  1106                    bigquery_tools.parse_table_reference(destination).tableId,
  1107                    (destination, job_reference)))
  1108            | beam.GroupByKey()
  1109            | beam.MapTuple(lambda tableId, batch: list(batch)))
  1110      else:
  1111        # Loads can happen in parallel.
  1112        finished_temp_tables_load_job_ids_list_pc = (
  1113            finished_temp_tables_load_job_ids_pc | beam.Map(lambda x: [x]))
  1114  
  1115      copy_job_outputs = (
  1116          finished_temp_tables_load_job_ids_list_pc
  1117          | beam.ParDo(
  1118              TriggerCopyJobs(
  1119                  project=self.project,
  1120                  create_disposition=self.create_disposition,
  1121                  write_disposition=self.write_disposition,
  1122                  test_client=self.test_client,
  1123                  step_name=step_name,
  1124                  load_job_project_id=self.load_job_project_id),
  1125              copy_job_name_pcv,
  1126              pvalue.AsIter(schema_mod_job_ids_pc)).with_outputs(
  1127                  TriggerCopyJobs.TRIGGER_DELETE_TEMP_TABLES, main='main'))
  1128  
  1129      destination_copy_job_ids_pc = copy_job_outputs['main']
  1130      trigger_delete = copy_job_outputs[
  1131          TriggerCopyJobs.TRIGGER_DELETE_TEMP_TABLES]
  1132  
  1133      _ = (
  1134          temp_tables_pc
  1135          | "RemoveTempTables/AddUselessValue" >> beam.Map(
  1136              lambda x, unused_trigger: (x, None), pvalue.AsList(trigger_delete))
  1137          | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
  1138          | "RemoveTempTables/GetTableNames" >> beam.Keys()
  1139          | "RemoveTempTables/Delete" >> beam.ParDo(
  1140              DeleteTablesFn(self.test_client)))
  1141  
  1142      # Load data directly to destination table
  1143      destination_load_job_ids_pc = (
  1144          partitions_direct_to_destination
  1145          | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
  1146              TriggerLoadJobs(
  1147                  schema=self.schema,
  1148                  write_disposition=self.write_disposition,
  1149                  create_disposition=self.create_disposition,
  1150                  test_client=self.test_client,
  1151                  temporary_tables=False,
  1152                  additional_bq_parameters=self.additional_bq_parameters,
  1153                  source_format=self._temp_file_format,
  1154                  step_name=step_name,
  1155                  load_job_project_id=self.load_job_project_id),
  1156              load_job_name_pcv,
  1157              *self.schema_side_inputs).with_outputs(
  1158                  TriggerLoadJobs.ONGOING_JOBS, main='main')
  1159      )[TriggerLoadJobs.ONGOING_JOBS]
  1160  
  1161      destination_load_job_ids_pc = (
  1162          (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
  1163          | beam.Flatten())
  1164  
  1165      return destination_load_job_ids_pc, destination_copy_job_ids_pc
  1166  
  1167    def expand(self, pcoll):
  1168      p = pcoll.pipeline
  1169      self.project = self.project or p.options.view_as(GoogleCloudOptions).project
  1170      try:
  1171        step_name = self.label
  1172      except AttributeError:
  1173        step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
  1174        BigQueryBatchFileLoads.COUNT += 1
  1175  
  1176      temp_location = p.options.view_as(GoogleCloudOptions).temp_location
  1177      job_name = (
  1178          p.options.view_as(GoogleCloudOptions).job_name or 'AUTOMATIC_JOB_NAME')
  1179  
  1180      empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
  1181      singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])
  1182  
  1183      load_job_name_pcv = pvalue.AsSingleton(
  1184          singleton_pc
  1185          | "LoadJobNamePrefix" >> beam.Map(
  1186              lambda _: _generate_job_name(
  1187                  job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP')))
  1188  
  1189      schema_mod_job_name_pcv = pvalue.AsSingleton(
  1190          singleton_pc
  1191          | "SchemaModJobNamePrefix" >> beam.Map(
  1192              lambda _: _generate_job_name(
  1193                  job_name,
  1194                  bigquery_tools.BigQueryJobTypes.LOAD,
  1195                  'SCHEMA_MOD_STEP')))
  1196  
  1197      copy_job_name_pcv = pvalue.AsSingleton(
  1198          singleton_pc
  1199          | "CopyJobNamePrefix" >> beam.Map(
  1200              lambda _: _generate_job_name(
  1201                  job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP')))
  1202  
  1203      file_prefix_pcv = pvalue.AsSingleton(
  1204          singleton_pc
  1205          | "GenerateFilePrefix" >> beam.Map(
  1206              file_prefix_generator(
  1207                  self._validate, self._custom_gcs_temp_location, temp_location)))
  1208  
  1209      destination_data_kv_pc = (
  1210          pcoll
  1211          | "RewindowIntoGlobal" >> self._window_fn()
  1212          | "AppendDestination" >> beam.ParDo(
  1213              bigquery_tools.AppendDestinationsFn(self.destination),
  1214              *self.table_side_inputs))
  1215  
  1216      if not self.with_auto_sharding:
  1217        all_destination_file_pairs_pc = self._write_files(
  1218            destination_data_kv_pc, file_prefix_pcv)
  1219      else:
  1220        all_destination_file_pairs_pc = self._write_files_with_auto_sharding(
  1221            destination_data_kv_pc, file_prefix_pcv)
  1222  
  1223      grouped_files_pc = (
  1224          all_destination_file_pairs_pc
  1225          | "GroupFilesByTableDestinations" >> beam.GroupByKey())
  1226  
  1227      partitions = (
  1228          grouped_files_pc
  1229          | beam.ParDo(
  1230              PartitionFiles(
  1231                  self.max_partition_size,
  1232                  self.max_files_per_partition)).with_outputs(
  1233                      PartitionFiles.MULTIPLE_PARTITIONS_TAG,
  1234                      PartitionFiles.SINGLE_PARTITION_TAG))
  1235  
  1236      multiple_partitions_per_destination_pc = partitions[
  1237          PartitionFiles.MULTIPLE_PARTITIONS_TAG]
  1238      single_partition_per_destination_pc = partitions[
  1239          PartitionFiles.SINGLE_PARTITION_TAG]
  1240  
  1241      # When using dynamic destinations, both single-partition and
  1242      # multiple-partition elements are loaded into BigQuery using temporary
  1243      # tables to ensure atomicity.
  1244      if self.dynamic_destinations:
  1245        all_partitions = ((
  1246            multiple_partitions_per_destination_pc,
  1247            single_partition_per_destination_pc)
  1248                          | "FlattenPartitions" >> beam.Flatten())
  1249        destination_load_job_ids_pc, destination_copy_job_ids_pc = (
  1250            self._load_data(all_partitions,
  1251                            empty_pc,
  1252                            load_job_name_pcv,
  1253                            schema_mod_job_name_pcv,
  1254                            copy_job_name_pcv,
  1255                            p,
  1256                            step_name))
  1257      else:
  1258        destination_load_job_ids_pc, destination_copy_job_ids_pc = (
  1259            self._load_data(multiple_partitions_per_destination_pc,
  1260                            single_partition_per_destination_pc,
  1261                            load_job_name_pcv,
  1262                            schema_mod_job_name_pcv,
  1263                            copy_job_name_pcv,
  1264                            p,
  1265                            step_name))
  1266  
  1267      return {
  1268          self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
  1269          self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
  1270          self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  1271      }