github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_file_loads.py 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """ 19 Functionality to perform file loads into BigQuery for Batch and Streaming 20 pipelines. 21 22 This source is able to work around BigQuery load quotas and limitations. When 23 destinations are dynamic, or when data for a single job is too large, the data 24 will be split into multiple jobs. 25 26 NOTHING IN THIS FILE HAS BACKWARDS COMPATIBILITY GUARANTEES. 27 """ 28 29 # pytype: skip-file 30 31 import hashlib 32 import io 33 import logging 34 import random 35 import time 36 import uuid 37 38 import apache_beam as beam 39 from apache_beam import pvalue 40 from apache_beam.io import filesystems as fs 41 from apache_beam.io.gcp import bigquery_tools 42 from apache_beam.io.gcp.bigquery_io_metadata import create_bigquery_io_metadata 43 from apache_beam.options import value_provider as vp 44 from apache_beam.options.pipeline_options import GoogleCloudOptions 45 from apache_beam.transforms import trigger 46 from apache_beam.transforms.display import DisplayDataItem 47 from apache_beam.transforms.util import GroupIntoBatches 48 from apache_beam.transforms.window import GlobalWindows 49 50 # Protect against environments where bigquery library is not available. 51 # pylint: disable=wrong-import-order, wrong-import-position 52 try: 53 from apitools.base.py.exceptions import HttpError 54 except ImportError: 55 pass 56 57 _LOGGER = logging.getLogger(__name__) 58 59 ONE_TERABYTE = (1 << 40) 60 61 # The maximum file size for imports is 5TB. We keep our files under that. 62 _DEFAULT_MAX_FILE_SIZE = 4 * ONE_TERABYTE 63 64 _DEFAULT_MAX_WRITERS_PER_BUNDLE = 20 65 66 # The maximum size for a single load job is 15 terabytes, across all source files. 67 _MAXIMUM_LOAD_SIZE = 15 * ONE_TERABYTE 68 69 # BigQuery only supports up to 10,000 source URIs for a single load job. 70 _MAXIMUM_SOURCE_URIS = 10 * 1000 71 72 # If triggering_frequency is supplied, we will trigger the file write after 73 # this many records are written. 74 _FILE_TRIGGERING_RECORD_COUNT = 500000 75 76 # If using auto-sharding for unbounded data, we batch the records before 77 # triggering file write to avoid generating too many small files.
78 _FILE_TRIGGERING_BATCHING_DURATION_SECS = 1 79 80 # How many seconds we wait before polling a pending job 81 _SLEEP_DURATION_BETWEEN_POLLS = 10 82 83 84 def _generate_job_name(job_name, job_type, step_name): 85 return bigquery_tools.generate_bq_job_name( 86 job_name=job_name, 87 step_id=step_name, 88 job_type=job_type, 89 random=random.randint(0, 1000)) 90 91 92 def file_prefix_generator( 93 with_validation=True, pipeline_gcs_location=None, temp_location=None): 94 def _generate_file_prefix(unused_elm): 95 # If a gcs location is provided to the pipeline, then we shall use that. 96 # Otherwise, we shall use the temp_location from pipeline options. 97 gcs_base = pipeline_gcs_location.get() 98 if not gcs_base: 99 gcs_base = temp_location 100 101 # This will fail at pipeline execution time, but will fail early, as this 102 # step doesn't have any dependencies (and thus will be one of the first 103 # stages to be run). 104 if with_validation and (not gcs_base or not gcs_base.startswith('gs://')): 105 raise ValueError( 106 'Invalid GCS location: %r.\n' 107 'Writing to BigQuery with FILE_LOADS method requires a' 108 ' GCS location to be provided to write files to be loaded' 109 ' into BigQuery. Please provide a GCS bucket through' 110 ' custom_gcs_temp_location in the constructor of WriteToBigQuery' 111 ' or the fallback option --temp_location, or pass' 112 ' method="STREAMING_INSERTS" to WriteToBigQuery.' % gcs_base) 113 114 prefix_uuid = _bq_uuid() 115 return fs.FileSystems.join(gcs_base, 'bq_load', prefix_uuid) 116 117 return _generate_file_prefix 118 119 120 def _make_new_file_writer( 121 file_prefix, 122 destination, 123 file_format, 124 schema=None, 125 schema_side_inputs=tuple()): 126 destination = bigquery_tools.get_hashable_destination(destination) 127 128 # Windows does not allow ':' in filenames, so we replace it with a period. 129 # Other disallowed characters are: 130 # https://docs.microsoft.com/en-us/windows/desktop/fileio/naming-a-file 131 destination = destination.replace(':', '.') 132 133 directory = fs.FileSystems.join(file_prefix, destination) 134 135 if not fs.FileSystems.exists(directory): 136 fs.FileSystems.mkdirs(directory) 137 138 file_name = str(uuid.uuid4()) 139 file_path = fs.FileSystems.join(file_prefix, destination, file_name) 140 141 if file_format == bigquery_tools.FileFormat.AVRO: 142 if callable(schema): 143 schema = schema(destination, *schema_side_inputs) 144 elif isinstance(schema, vp.ValueProvider): 145 schema = schema.get() 146 147 writer = bigquery_tools.AvroRowWriter( 148 fs.FileSystems.create(file_path, "application/avro"), schema) 149 elif file_format == bigquery_tools.FileFormat.JSON: 150 writer = bigquery_tools.JsonRowWriter( 151 fs.FileSystems.create(file_path, "application/text")) 152 else: 153 raise ValueError(( 154 'Only AVRO and JSON are supported as intermediate formats for ' 155 'BigQuery WriteRecordsToFile, got: {}.').format(file_format)) 156 157 return file_path, writer 158 159 160 def _bq_uuid(seed=None): 161 if not seed: 162 return str(uuid.uuid4()).replace("-", "") 163 else: 164 return str(hashlib.md5(seed.encode('utf8')).hexdigest()) 165 166 167 class _ShardDestinations(beam.DoFn): 168 """Adds a shard number to the key of the KV element.
169 170 Experimental; no backwards compatibility guarantees.""" 171 DEFAULT_SHARDING_FACTOR = 10 172 173 def __init__(self, sharding_factor=DEFAULT_SHARDING_FACTOR): 174 self.sharding_factor = sharding_factor 175 176 def start_bundle(self): 177 self._shard_count = random.randrange(self.sharding_factor) 178 179 def process(self, element): 180 destination = element[0] 181 row = element[1] 182 183 sharded_destination = ( 184 destination, self._shard_count % self.sharding_factor) 185 self._shard_count += 1 186 yield (sharded_destination, row) 187 188 189 class WriteRecordsToFile(beam.DoFn): 190 """Write input records to files before triggering a load job. 191 192 This transform keeps up to ``max_files_per_bundle`` files open to write to. It 193 receives (destination, record) tuples, and it writes the records to different 194 files for each destination. 195 196 If there are more than ``max_files_per_bundle`` destinations that we need to 197 write to, then those records are grouped by their destination, and later 198 written to files by ``WriteGroupedRecordsToFile``. 199 200 It outputs two PCollections. 201 """ 202 203 UNWRITTEN_RECORD_TAG = 'UnwrittenRecords' 204 WRITTEN_FILE_TAG = 'WrittenFiles' 205 206 def __init__( 207 self, 208 schema, 209 max_files_per_bundle=_DEFAULT_MAX_WRITERS_PER_BUNDLE, 210 max_file_size=_DEFAULT_MAX_FILE_SIZE, 211 file_format=None): 212 """Initialize a :class:`WriteRecordsToFile`. 213 214 Args: 215 max_files_per_bundle (int): The maximum number of files that can be kept 216 open during execution of this step in a worker. This is to avoid over- 217 whelming the worker memory. 218 max_file_size (int): The maximum size in bytes for a file to be used in 219 an export job. 220 221 """ 222 self.schema = schema 223 self.max_files_per_bundle = max_files_per_bundle 224 self.max_file_size = max_file_size 225 self.file_format = file_format or bigquery_tools.FileFormat.JSON 226 227 def display_data(self): 228 return { 229 'max_files_per_bundle': self.max_files_per_bundle, 230 'max_file_size': str(self.max_file_size), 231 'file_format': self.file_format, 232 } 233 234 def start_bundle(self): 235 self._destination_to_file_writer = {} 236 237 def process(self, element, file_prefix, *schema_side_inputs): 238 """Take a tuple with (destination, row) and write to file or spill out. 239 240 Destination may be a ``TableReference`` or a string, and row is a 241 Python dictionary for a row to be inserted to BigQuery.""" 242 destination = bigquery_tools.get_hashable_destination(element[0]) 243 row = element[1] 244 245 if destination not in self._destination_to_file_writer: 246 if len(self._destination_to_file_writer) < self.max_files_per_bundle: 247 self._destination_to_file_writer[destination] = _make_new_file_writer( 248 file_prefix, 249 destination, 250 self.file_format, 251 self.schema, 252 schema_side_inputs) 253 else: 254 yield pvalue.TaggedOutput( 255 WriteRecordsToFile.UNWRITTEN_RECORD_TAG, element) 256 return 257 258 (file_path, writer) = self._destination_to_file_writer[destination] 259 260 # TODO(pabloem): Is it possible for this to throw exception? 
261 writer.write(row) 262 263 file_size = writer.tell() 264 if file_size > self.max_file_size: 265 writer.close() 266 self._destination_to_file_writer.pop(destination) 267 yield pvalue.TaggedOutput( 268 WriteRecordsToFile.WRITTEN_FILE_TAG, 269 (destination, (file_path, file_size))) 270 271 def finish_bundle(self): 272 for destination, file_path_writer in \ 273 self._destination_to_file_writer.items(): 274 (file_path, writer) = file_path_writer 275 file_size = writer.tell() 276 writer.close() 277 yield pvalue.TaggedOutput( 278 WriteRecordsToFile.WRITTEN_FILE_TAG, 279 GlobalWindows.windowed_value((destination, (file_path, file_size)))) 280 self._destination_to_file_writer = {} 281 282 283 class WriteGroupedRecordsToFile(beam.DoFn): 284 """Receives collection of dest-iterable(records), writes it to files. 285 286 This is different from ``WriteRecordsToFile`` because it receives records 287 grouped by destination. This means that it's not necessary to keep multiple 288 file descriptors open, because we know for sure when records for a single 289 destination have been written out. 290 291 Experimental; no backwards compatibility guarantees. 292 """ 293 def __init__( 294 self, schema, max_file_size=_DEFAULT_MAX_FILE_SIZE, file_format=None): 295 self.schema = schema 296 self.max_file_size = max_file_size 297 self.file_format = file_format or bigquery_tools.FileFormat.JSON 298 299 def process(self, element, file_prefix, *schema_side_inputs): 300 destination = bigquery_tools.get_hashable_destination(element[0]) 301 rows = element[1] 302 303 file_path, writer = None, None 304 305 for row in rows: 306 if writer is None: 307 (file_path, writer) = _make_new_file_writer( 308 file_prefix, 309 destination, 310 self.file_format, 311 self.schema, 312 schema_side_inputs) 313 314 writer.write(row) 315 316 file_size = writer.tell() 317 if file_size > self.max_file_size: 318 writer.close() 319 yield (destination, (file_path, file_size)) 320 file_path, writer = None, None 321 if writer is not None: 322 writer.close() 323 yield (destination, (file_path, file_size)) 324 325 326 class UpdateDestinationSchema(beam.DoFn): 327 """Update destination schema based on data that is about to be copied into it. 328 329 Unlike load and query jobs, BigQuery copy jobs do not support schema field 330 addition or relaxation on the destination table. This DoFn fills that gap by 331 updating the destination table schemas to be compatible with the data coming 332 from the source table so that schema field modification options are respected 333 regardless of whether data is loaded directly to the destination table or 334 loaded into temporary tables before being copied into the destination. 335 336 This transform takes as input a (destination, job_reference) pair where the 337 job_reference refers to a completed load job into a temporary table. 338 339 This transform emits (destination, job_reference) pairs where the 340 job_reference refers to a submitted load job for performing the schema 341 modification in JSON format. Note that the input and output job references 342 are not the same. 343 344 Experimental; no backwards compatibility guarantees. 
345 """ 346 def __init__( 347 self, 348 project=None, 349 write_disposition=None, 350 test_client=None, 351 additional_bq_parameters=None, 352 step_name=None, 353 load_job_project_id=None): 354 self.project = project 355 self._test_client = test_client 356 self._write_disposition = write_disposition 357 self._additional_bq_parameters = additional_bq_parameters or {} 358 self._step_name = step_name 359 self._load_job_project_id = load_job_project_id 360 361 def start_bundle(self): 362 self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self._test_client) 363 self._bq_io_metadata = create_bigquery_io_metadata(self._step_name) 364 self.pending_jobs = [] 365 366 def display_data(self): 367 return { 368 'write_disposition': str(self._write_disposition), 369 'additional_bq_params': str(self._additional_bq_parameters), 370 } 371 372 def process(self, element, schema_mod_job_name_prefix): 373 destination = element[0] 374 temp_table_load_job_reference = element[1] 375 376 if callable(self._additional_bq_parameters): 377 additional_parameters = self._additional_bq_parameters(destination) 378 elif isinstance(self._additional_bq_parameters, vp.ValueProvider): 379 additional_parameters = self._additional_bq_parameters.get() 380 else: 381 additional_parameters = self._additional_bq_parameters 382 383 # When writing to normal tables WRITE_TRUNCATE will overwrite the schema but 384 # when writing to a partition, care needs to be taken to update the schema 385 # even on WRITE_TRUNCATE. 386 if (self._write_disposition not in ('WRITE_TRUNCATE', 'WRITE_APPEND') or 387 not additional_parameters or 388 not additional_parameters.get("schemaUpdateOptions")): 389 # No need to modify schema of destination table 390 return 391 392 table_reference = bigquery_tools.parse_table_reference(destination) 393 if table_reference.projectId is None: 394 table_reference.projectId = vp.RuntimeValueProvider.get_value( 395 'project', str, '') or self.project 396 397 try: 398 # Check if destination table exists 399 destination_table = self.bq_wrapper.get_table( 400 project_id=table_reference.projectId, 401 dataset_id=table_reference.datasetId, 402 table_id=table_reference.tableId) 403 except HttpError as exn: 404 if exn.status_code == 404: 405 # Destination table does not exist, so no need to modify its schema 406 # ahead of the copy jobs. 407 return 408 else: 409 raise 410 411 temp_table_load_job = self.bq_wrapper.get_job( 412 project=temp_table_load_job_reference.projectId, 413 job_id=temp_table_load_job_reference.jobId, 414 location=temp_table_load_job_reference.location) 415 temp_table_schema = temp_table_load_job.configuration.load.schema 416 417 if bigquery_tools.check_schema_equal(temp_table_schema, 418 destination_table.schema, 419 ignore_descriptions=True, 420 ignore_field_order=True): 421 # Destination table schema is already the same as the temp table schema, 422 # so no need to run a job to update the destination table schema. 423 return 424 425 destination_hash = _bq_uuid( 426 '%s:%s.%s' % ( 427 table_reference.projectId, 428 table_reference.datasetId, 429 table_reference.tableId)) 430 uid = _bq_uuid() 431 job_name = '%s_%s_%s' % (schema_mod_job_name_prefix, destination_hash, uid) 432 433 _LOGGER.info( 434 'Triggering schema modification job %s on %s', 435 job_name, 436 table_reference) 437 # Trigger potential schema modification by loading zero rows into the 438 # destination table with the temporary table schema. 
439 schema_update_job_reference = self.bq_wrapper.perform_load_job( 440 destination=table_reference, 441 source_stream=io.BytesIO(), # file with zero rows 442 job_id=job_name, 443 schema=temp_table_schema, 444 write_disposition='WRITE_APPEND', 445 create_disposition='CREATE_NEVER', 446 additional_load_parameters=additional_parameters, 447 job_labels=self._bq_io_metadata.add_additional_bq_job_labels(), 448 # JSON format is hardcoded because it permits loading zero rows (unlike 449 # AVRO) and a nested schema (unlike CSV, which assumes a flat one). 450 source_format="NEWLINE_DELIMITED_JSON", 451 load_job_project_id=self._load_job_project_id) 452 self.pending_jobs.append( 453 GlobalWindows.windowed_value( 454 (destination, schema_update_job_reference))) 455 456 def finish_bundle(self): 457 # Unlike the other steps, schema update is not always necessary. 458 # In that case, return a None value to avoid blocking in streaming context. 459 # Otherwise, the streaming pipeline would get stuck waiting for the 460 # TriggerCopyJobs side-input. 461 if not self.pending_jobs: 462 return [GlobalWindows.windowed_value(None)] 463 464 for windowed_value in self.pending_jobs: 465 job_ref = windowed_value.value[1] 466 self.bq_wrapper.wait_for_bq_job( 467 job_ref, sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS) 468 return self.pending_jobs 469 470 471 class TriggerCopyJobs(beam.DoFn): 472 """Launches jobs to copy from temporary tables into the main target table. 473 474 When a job needs to write to multiple destination tables, or when a single 475 destination table needs to have multiple load jobs to write to it, files are 476 loaded into temporary tables, and those tables are later copied to the 477 destination tables. 478 479 This transform emits (destination, job_reference) pairs. 480 481 TODO(BEAM-7822): In the file loads method of writing to BigQuery, 482 copying from temp_tables to destination_table is not atomic. 483 See: https://issues.apache.org/jira/browse/BEAM-7822 484 """ 485 486 TRIGGER_DELETE_TEMP_TABLES = 'TriggerDeleteTempTables' 487 488 def __init__( 489 self, 490 project=None, 491 create_disposition=None, 492 write_disposition=None, 493 test_client=None, 494 step_name=None, 495 load_job_project_id=None): 496 self.project = project 497 self.create_disposition = create_disposition 498 self.write_disposition = write_disposition 499 self.test_client = test_client 500 self._observed_tables = set() 501 self.bq_io_metadata = None 502 self._step_name = step_name 503 self.load_job_project_id = load_job_project_id 504 505 def display_data(self): 506 return { 507 'launchesBigQueryJobs': DisplayDataItem( 508 True, label="This Dataflow job launches bigquery jobs.") 509 } 510 511 def setup(self): 512 self._observed_tables = set() 513 514 def start_bundle(self): 515 self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client) 516 if not self.bq_io_metadata: 517 self.bq_io_metadata = create_bigquery_io_metadata(self._step_name) 518 self.pending_jobs = [] 519 520 def process( 521 self, element_list, job_name_prefix=None, unused_schema_mod_jobs=None): 522 if isinstance(element_list, tuple): 523 # Allow this for streaming update compatibility while fixing BEAM-24535.
524 self.process_one(element_list, job_name_prefix) 525 else: 526 for element in element_list: 527 self.process_one(element, job_name_prefix) 528 529 def process_one(self, element, job_name_prefix): 530 destination, job_reference = element 531 532 copy_to_reference = bigquery_tools.parse_table_reference(destination) 533 if copy_to_reference.projectId is None: 534 copy_to_reference.projectId = vp.RuntimeValueProvider.get_value( 535 'project', str, '') or self.project 536 537 copy_from_reference = bigquery_tools.parse_table_reference(destination) 538 copy_from_reference.tableId = job_reference.jobId 539 if copy_from_reference.projectId is None: 540 copy_from_reference.projectId = vp.RuntimeValueProvider.get_value( 541 'project', str, '') or self.project 542 543 copy_job_name = '%s_%s' % ( 544 job_name_prefix, 545 _bq_uuid( 546 '%s:%s.%s' % ( 547 copy_from_reference.projectId, 548 copy_from_reference.datasetId, 549 copy_from_reference.tableId))) 550 551 _LOGGER.info( 552 "Triggering copy job from %s to %s", 553 copy_from_reference, 554 copy_to_reference) 555 if copy_to_reference.tableId not in self._observed_tables: 556 # When the write_disposition for a job is WRITE_TRUNCATE, 557 # multiple copy jobs to the same destination can stomp on 558 # each other, truncate data, and write to the BQ table over and 559 # over. 560 # Thus, the first copy job runs with the user's write_disposition, 561 # but afterwards, all jobs must always WRITE_APPEND to the table. 562 # If they do not, subsequent copy jobs will clear out data appended 563 # by previous jobs. 564 write_disposition = self.write_disposition 565 wait_for_job = True 566 self._observed_tables.add(copy_to_reference.tableId) 567 else: 568 wait_for_job = False 569 write_disposition = 'WRITE_APPEND' 570 571 if not self.bq_io_metadata: 572 self.bq_io_metadata = create_bigquery_io_metadata(self._step_name) 573 574 project_id = ( 575 copy_to_reference.projectId 576 if self.load_job_project_id is None else self.load_job_project_id) 577 job_reference = self.bq_wrapper._insert_copy_job( 578 project_id, 579 copy_job_name, 580 copy_from_reference, 581 copy_to_reference, 582 create_disposition=self.create_disposition, 583 write_disposition=write_disposition, 584 job_labels=self.bq_io_metadata.add_additional_bq_job_labels()) 585 586 if wait_for_job: 587 self.bq_wrapper.wait_for_bq_job(job_reference, sleep_duration_sec=10) 588 self.pending_jobs.append( 589 GlobalWindows.windowed_value((destination, job_reference))) 590 591 def finish_bundle(self): 592 for windowed_value in self.pending_jobs: 593 job_ref = windowed_value.value[1] 594 self.bq_wrapper.wait_for_bq_job( 595 job_ref, sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS) 596 yield windowed_value 597 598 yield pvalue.TaggedOutput( 599 TriggerCopyJobs.TRIGGER_DELETE_TEMP_TABLES, 600 GlobalWindows.windowed_value(None)) 601 602 603 class TriggerLoadJobs(beam.DoFn): 604 """Triggers the import jobs to BQ. 605 606 Experimental; no backwards compatibility guarantees.
607 """ 608 609 TEMP_TABLES = 'TemporaryTables' 610 ONGOING_JOBS = 'OngoingJobs' 611 612 def __init__( 613 self, 614 schema=None, 615 project=None, 616 create_disposition=None, 617 write_disposition=None, 618 test_client=None, 619 temporary_tables=False, 620 additional_bq_parameters=None, 621 source_format=None, 622 step_name=None, 623 load_job_project_id=None): 624 self.schema = schema 625 self.project = project 626 self.test_client = test_client 627 self.temporary_tables = temporary_tables 628 self.additional_bq_parameters = additional_bq_parameters or {} 629 self.source_format = source_format 630 self.bq_io_metadata = None 631 self._step_name = step_name 632 self.load_job_project_id = load_job_project_id 633 if self.temporary_tables: 634 # If we are loading into temporary tables, we rely on the default create 635 # and write dispositions, which mean that a new table will be created. 636 self.create_disposition = None 637 self.write_disposition = None 638 else: 639 self.create_disposition = create_disposition 640 self.write_disposition = write_disposition 641 642 def display_data(self): 643 result = { 644 'create_disposition': str(self.create_disposition), 645 'write_disposition': str(self.write_disposition), 646 'additional_bq_params': str(self.additional_bq_parameters), 647 'schema': str(self.schema), 648 'launchesBigQueryJobs': DisplayDataItem( 649 True, label="This Dataflow job launches bigquery jobs."), 650 'source_format': str(self.source_format), 651 } 652 return result 653 654 def start_bundle(self): 655 self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client) 656 if not self.bq_io_metadata: 657 self.bq_io_metadata = create_bigquery_io_metadata(self._step_name) 658 self.pending_jobs = [] 659 660 def process(self, element, load_job_name_prefix, *schema_side_inputs): 661 # Each load job is assumed to have files respecting these constraints: 662 # 1. Total size of all files < 15 TB (Max size for load jobs) 663 # 2. Total no. of files in a single load job < 10,000 664 # This assumption means that there will always be a single load job 665 # triggered for each partition of files. 666 destination = element[0] 667 files = element[1] 668 669 if callable(self.schema): 670 schema = self.schema(destination, *schema_side_inputs) 671 elif isinstance(self.schema, vp.ValueProvider): 672 schema = self.schema.get() 673 else: 674 schema = self.schema 675 676 if callable(self.additional_bq_parameters): 677 additional_parameters = self.additional_bq_parameters(destination) 678 elif isinstance(self.additional_bq_parameters, vp.ValueProvider): 679 additional_parameters = self.additional_bq_parameters.get() 680 else: 681 additional_parameters = self.additional_bq_parameters 682 683 table_reference = bigquery_tools.parse_table_reference(destination) 684 if table_reference.projectId is None: 685 table_reference.projectId = vp.RuntimeValueProvider.get_value( 686 'project', str, '') or self.project 687 # Load jobs for a single destination are always triggered from the same 688 # worker. This means that we can generate a deterministic numbered job id, 689 # and not need to worry. 690 destination_hash = _bq_uuid( 691 '%s:%s.%s' % ( 692 table_reference.projectId, 693 table_reference.datasetId, 694 table_reference.tableId)) 695 uid = _bq_uuid() 696 job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid) 697 _LOGGER.info('Load job has %s files. 
Job name is %s.', len(files), job_name) 698 699 create_disposition = self.create_disposition 700 if self.temporary_tables: 701 # If we are using temporary tables, then we must always create the 702 # temporary tables, so we replace the create_disposition. 703 create_disposition = 'CREATE_IF_NEEDED' 704 # For temporary tables, we create a new table with the name with JobId. 705 table_reference.tableId = job_name 706 yield pvalue.TaggedOutput( 707 TriggerLoadJobs.TEMP_TABLES, 708 bigquery_tools.get_hashable_destination(table_reference)) 709 710 _LOGGER.info( 711 'Triggering job %s to load data to BigQuery table %s.' 712 'Schema: %s. Additional parameters: %s. Source format: %s', 713 job_name, 714 table_reference, 715 schema, 716 additional_parameters, 717 self.source_format, 718 ) 719 if not self.bq_io_metadata: 720 self.bq_io_metadata = create_bigquery_io_metadata(self._step_name) 721 job_reference = self.bq_wrapper.perform_load_job( 722 destination=table_reference, 723 source_uris=files, 724 job_id=job_name, 725 schema=schema, 726 write_disposition=self.write_disposition, 727 create_disposition=create_disposition, 728 additional_load_parameters=additional_parameters, 729 source_format=self.source_format, 730 job_labels=self.bq_io_metadata.add_additional_bq_job_labels(), 731 load_job_project_id=self.load_job_project_id) 732 yield pvalue.TaggedOutput( 733 TriggerLoadJobs.ONGOING_JOBS, (destination, job_reference)) 734 self.pending_jobs.append( 735 GlobalWindows.windowed_value((destination, job_reference))) 736 737 def finish_bundle(self): 738 for windowed_value in self.pending_jobs: 739 job_ref = windowed_value.value[1] 740 self.bq_wrapper.wait_for_bq_job( 741 job_ref, sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS) 742 return self.pending_jobs 743 744 745 class PartitionFiles(beam.DoFn): 746 747 MULTIPLE_PARTITIONS_TAG = 'MULTIPLE_PARTITIONS' 748 SINGLE_PARTITION_TAG = 'SINGLE_PARTITION' 749 750 class Partition(object): 751 def __init__(self, max_size, max_files, files=None, size=0): 752 self.max_size = max_size 753 self.max_files = max_files 754 self.files = files if files is not None else [] 755 self.size = size 756 757 def can_accept(self, file_size, no_of_files=1): 758 if (((self.size + file_size) <= self.max_size) and 759 ((len(self.files) + no_of_files) <= self.max_files)): 760 return True 761 else: 762 return False 763 764 def add(self, file_path, file_size): 765 self.files.append(file_path) 766 self.size += file_size 767 768 def __init__(self, max_partition_size, max_files_per_partition): 769 self.max_partition_size = max_partition_size 770 self.max_files_per_partition = max_files_per_partition 771 772 def process(self, element): 773 destination = element[0] 774 files = element[1] 775 partitions = [] 776 777 if not files: 778 _LOGGER.warning( 779 'Ignoring a BigQuery batch load partition to %s ' 780 'that contains no source URIs.', 781 destination) 782 return 783 784 latest_partition = PartitionFiles.Partition( 785 self.max_partition_size, self.max_files_per_partition) 786 787 for file_path, file_size in files: 788 if latest_partition.can_accept(file_size): 789 latest_partition.add(file_path, file_size) 790 else: 791 partitions.append(latest_partition.files) 792 latest_partition = PartitionFiles.Partition( 793 self.max_partition_size, self.max_files_per_partition) 794 latest_partition.add(file_path, file_size) 795 partitions.append(latest_partition.files) 796 797 if len(partitions) > 1: 798 output_tag = PartitionFiles.MULTIPLE_PARTITIONS_TAG 799 else: 800 output_tag = 
PartitionFiles.SINGLE_PARTITION_TAG 801 802 for partition in partitions: 803 yield pvalue.TaggedOutput(output_tag, (destination, partition)) 804 805 806 class DeleteTablesFn(beam.DoFn): 807 def __init__(self, test_client=None): 808 self.test_client = test_client 809 810 def start_bundle(self): 811 self.bq_wrapper = bigquery_tools.BigQueryWrapper(client=self.test_client) 812 813 def process(self, table_reference): 814 _LOGGER.info("Deleting table %s", table_reference) 815 table_reference = bigquery_tools.parse_table_reference(table_reference) 816 self.bq_wrapper._delete_table( 817 table_reference.projectId, 818 table_reference.datasetId, 819 table_reference.tableId) 820 821 822 class BigQueryBatchFileLoads(beam.PTransform): 823 """Takes in a set of elements, and inserts them to BigQuery via batch loads. 824 825 """ 826 827 DESTINATION_JOBID_PAIRS = 'destination_load_jobid_pairs' 828 DESTINATION_FILE_PAIRS = 'destination_file_pairs' 829 DESTINATION_COPY_JOBID_PAIRS = 'destination_copy_jobid_pairs' 830 COUNT = 0 831 832 def __init__( 833 self, 834 destination, 835 project=None, 836 schema=None, 837 custom_gcs_temp_location=None, 838 create_disposition=None, 839 write_disposition=None, 840 triggering_frequency=None, 841 with_auto_sharding=False, 842 temp_file_format=None, 843 max_file_size=None, 844 max_files_per_bundle=None, 845 max_partition_size=None, 846 max_files_per_partition=None, 847 additional_bq_parameters=None, 848 table_side_inputs=None, 849 schema_side_inputs=None, 850 test_client=None, 851 validate=True, 852 is_streaming_pipeline=False, 853 load_job_project_id=None): 854 self.destination = destination 855 self.project = project 856 self.create_disposition = create_disposition 857 self.write_disposition = write_disposition 858 self.triggering_frequency = triggering_frequency 859 self.with_auto_sharding = with_auto_sharding 860 self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE 861 self.max_files_per_bundle = ( 862 max_files_per_bundle or _DEFAULT_MAX_WRITERS_PER_BUNDLE) 863 self.max_partition_size = max_partition_size or _MAXIMUM_LOAD_SIZE 864 self.max_files_per_partition = ( 865 max_files_per_partition or _MAXIMUM_SOURCE_URIS) 866 if (isinstance(custom_gcs_temp_location, str) or 867 custom_gcs_temp_location is None): 868 self._custom_gcs_temp_location = vp.StaticValueProvider( 869 str, custom_gcs_temp_location or '') 870 elif isinstance(custom_gcs_temp_location, vp.ValueProvider): 871 self._custom_gcs_temp_location = custom_gcs_temp_location 872 else: 873 raise ValueError('custom_gcs_temp_location must be str or ValueProvider') 874 875 self.test_client = test_client 876 self.schema = schema 877 self._temp_file_format = temp_file_format or bigquery_tools.FileFormat.JSON 878 879 # If we have multiple destinations, then we will have multiple load jobs, 880 # thus we will need temporary tables for atomicity. 
881 self.dynamic_destinations = bool(callable(destination)) 882 883 self.additional_bq_parameters = additional_bq_parameters or {} 884 self.table_side_inputs = table_side_inputs or () 885 self.schema_side_inputs = schema_side_inputs or () 886 887 self.is_streaming_pipeline = is_streaming_pipeline 888 self.load_job_project_id = load_job_project_id 889 self._validate = validate 890 if self._validate: 891 self.verify() 892 893 def verify(self): 894 if (isinstance(self._custom_gcs_temp_location.get(), vp.StaticValueProvider) 895 and not self._custom_gcs_temp_location.get().startswith('gs://')): 896 # Only fail if the custom location is provided, and it is not a GCS 897 # location. 898 raise ValueError( 899 'Invalid GCS location: %r.\n' 900 'Writing to BigQuery with FILE_LOADS method requires a ' 901 'GCS location to be provided to write files to be ' 902 'loaded into BigQuery. Please provide a GCS bucket, or ' 903 'pass method="STREAMING_INSERTS" to WriteToBigQuery.' % 904 self._custom_gcs_temp_location.get()) 905 if self.is_streaming_pipeline and not self.triggering_frequency: 906 raise ValueError( 907 'triggering_frequency must be specified to use file ' 908 'loads in streaming') 909 elif not self.is_streaming_pipeline and self.triggering_frequency: 910 raise ValueError( 911 'triggering_frequency can only be used with file ' 912 'loads in streaming') 913 if not self.is_streaming_pipeline and self.with_auto_sharding: 914 raise ValueError( 915 'with_auto_sharding can only be used with file loads in streaming.') 916 917 def _window_fn(self): 918 """Set the correct WindowInto PTransform""" 919 920 # The user-supplied triggering_frequency is often chosen to control how 921 # many BigQuery load jobs are triggered, to prevent going over BigQuery's 922 # daily quota for load jobs. If this is set to a large value, currently we 923 # have to buffer all the data until the trigger fires. Instead we ensure 924 # that the files are written if a threshold number of records are ready. 925 # We use only the user-supplied trigger on the actual BigQuery load. 926 # This allows us to offload the data to the filesystem. 927 # 928 # In the case of auto-sharding, however, we use a default trigger, since 929 # the transform that performs the sharding also batches elements to avoid 930 # generating too many tiny files. The user trigger is applied right after 931 # the writes to limit the number of load jobs.
932 if self.is_streaming_pipeline and not self.with_auto_sharding: 933 return beam.WindowInto(beam.window.GlobalWindows(), 934 trigger=trigger.Repeatedly( 935 trigger.AfterAny( 936 trigger.AfterProcessingTime( 937 self.triggering_frequency), 938 trigger.AfterCount( 939 _FILE_TRIGGERING_RECORD_COUNT))), 940 accumulation_mode=trigger.AccumulationMode\ 941 .DISCARDING) 942 else: 943 return beam.WindowInto(beam.window.GlobalWindows()) 944 945 def _maybe_apply_user_trigger(self, destination_file_kv_pc): 946 if self.is_streaming_pipeline: 947 # Apply the user's trigger back before we start triggering load jobs 948 return ( 949 destination_file_kv_pc 950 | "ApplyUserTrigger" >> beam.WindowInto( 951 beam.window.GlobalWindows(), 952 trigger=trigger.Repeatedly( 953 trigger.AfterAll( 954 trigger.AfterProcessingTime(self.triggering_frequency), 955 trigger.AfterCount(1))), 956 accumulation_mode=trigger.AccumulationMode.DISCARDING)) 957 else: 958 return destination_file_kv_pc 959 960 def _write_files(self, destination_data_kv_pc, file_prefix_pcv): 961 outputs = ( 962 destination_data_kv_pc 963 | beam.ParDo( 964 WriteRecordsToFile( 965 schema=self.schema, 966 max_files_per_bundle=self.max_files_per_bundle, 967 max_file_size=self.max_file_size, 968 file_format=self._temp_file_format), 969 file_prefix_pcv, 970 *self.schema_side_inputs).with_outputs( 971 WriteRecordsToFile.UNWRITTEN_RECORD_TAG, 972 WriteRecordsToFile.WRITTEN_FILE_TAG)) 973 974 # A PCollection of (destination, file) tuples. It lists files with records, 975 # and the destination each file is meant to be imported into. 976 destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG] 977 978 # A PCollection of (destination, record) tuples. These are later sharded, 979 # grouped, and all records for each destination-shard are written to files. 980 # This PCollection is necessary because not all records can be written into 981 # files in ``WriteRecordsToFile``. 982 unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG] 983 984 more_destination_files_kv_pc = ( 985 unwritten_records_pc 986 | beam.ParDo(_ShardDestinations()) 987 | "GroupShardedRows" >> beam.GroupByKey() 988 | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1])) 989 | "WriteGroupedRecordsToFile" >> beam.ParDo( 990 WriteGroupedRecordsToFile( 991 schema=self.schema, file_format=self._temp_file_format), 992 file_prefix_pcv, 993 *self.schema_side_inputs)) 994 995 # TODO(https://github.com/apache/beam/issues/20285): Remove the identity 996 # transform. We flatten both PCollection paths and use an identity function 997 # to work around a flatten optimization issue where the wrong coder is 998 # being used. 999 all_destination_file_pairs_pc = ( 1000 (destination_files_kv_pc, more_destination_files_kv_pc) 1001 | "DestinationFilesUnion" >> beam.Flatten() 1002 | "IdentityWorkaround" >> beam.Map(lambda x: x)) 1003 return self._maybe_apply_user_trigger(all_destination_file_pairs_pc) 1004 1005 def _write_files_with_auto_sharding( 1006 self, destination_data_kv_pc, file_prefix_pcv): 1007 clock = self.test_client.test_clock if self.test_client else time.time 1008 1009 # Auto-sharding is achieved via the GroupIntoBatches.WithShardedKey 1010 # transform, which shards, groups, and at the same time batches the table 1011 # rows to be inserted into BigQuery. 1012 1013 # Firstly, the keys of tagged_data (table references) are converted to a 1014 # hashable format. This is needed to work with the keyed states used by 1015 # GroupIntoBatches.
After grouping and batching is done, table references 1016 # are restored. 1017 destination_files_kv_pc = ( 1018 destination_data_kv_pc 1019 | 1020 'ToHashableTableRef' >> beam.Map(bigquery_tools.to_hashable_table_ref) 1021 | 'WithAutoSharding' >> GroupIntoBatches.WithShardedKey( 1022 batch_size=_FILE_TRIGGERING_RECORD_COUNT, 1023 max_buffering_duration_secs=_FILE_TRIGGERING_BATCHING_DURATION_SECS, 1024 clock=clock) 1025 | 'FromHashableTableRefAndDropShard' >> beam.Map( 1026 lambda kvs: 1027 (bigquery_tools.parse_table_reference(kvs[0].key), kvs[1])) 1028 | beam.ParDo( 1029 WriteGroupedRecordsToFile( 1030 schema=self.schema, file_format=self._temp_file_format), 1031 file_prefix_pcv, 1032 *self.schema_side_inputs)) 1033 1034 return self._maybe_apply_user_trigger(destination_files_kv_pc) 1035 1036 def _load_data( 1037 self, 1038 partitions_using_temp_tables, 1039 partitions_direct_to_destination, 1040 load_job_name_pcv, 1041 schema_mod_job_name_pcv, 1042 copy_job_name_pcv, 1043 p, 1044 step_name): 1045 """Load data into BigQuery 1046 1047 Data is loaded into BigQuery in the following two ways: 1048 1. Single partition: 1049 When there is a single partition of files destined for a single 1050 destination, a single load job is triggered. 1051 2. Multiple partitions and/or Dynamic Destinations: 1052 When there are multiple partitions of files destined for a single 1053 destination, or when Dynamic Destinations are used, one load job 1054 needs to be triggered for each partition/destination. Load jobs are 1055 triggered into temporary tables, and those are later copied to the 1056 appropriate destination table. This ensures atomicity when some of 1057 the load jobs fail but others do not: if any of them fails, the copy 1058 jobs are not triggered. 1059 """ 1060 # Load data using temp tables 1061 trigger_loads_outputs = ( 1062 partitions_using_temp_tables 1063 | "TriggerLoadJobsWithTempTables" >> beam.ParDo( 1064 TriggerLoadJobs( 1065 schema=self.schema, 1066 project=self.project, 1067 write_disposition=self.write_disposition, 1068 create_disposition=self.create_disposition, 1069 test_client=self.test_client, 1070 temporary_tables=True, 1071 additional_bq_parameters=self.additional_bq_parameters, 1072 source_format=self._temp_file_format, 1073 step_name=step_name, 1074 load_job_project_id=self.load_job_project_id), 1075 load_job_name_pcv, 1076 *self.schema_side_inputs).with_outputs( 1077 TriggerLoadJobs.TEMP_TABLES, 1078 TriggerLoadJobs.ONGOING_JOBS, 1079 main='main')) 1080 1081 finished_temp_tables_load_job_ids_pc = trigger_loads_outputs['main'] 1082 temp_tables_load_job_ids_pc = trigger_loads_outputs[ 1083 TriggerLoadJobs.ONGOING_JOBS] 1084 temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES] 1085 1086 schema_mod_job_ids_pc = ( 1087 finished_temp_tables_load_job_ids_pc 1088 | beam.ParDo( 1089 UpdateDestinationSchema( 1090 project=self.project, 1091 write_disposition=self.write_disposition, 1092 test_client=self.test_client, 1093 additional_bq_parameters=self.additional_bq_parameters, 1094 step_name=step_name, 1095 load_job_project_id=self.load_job_project_id), 1096 schema_mod_job_name_pcv)) 1097 1098 if self.write_disposition in ('WRITE_EMPTY', 'WRITE_TRUNCATE'): 1099 # All loads going to the same table must be processed together so that 1100 # the truncation happens only once. See 1101 # https://github.com/apache/beam/issues/24535.
1102 finished_temp_tables_load_job_ids_list_pc = ( 1103 finished_temp_tables_load_job_ids_pc | beam.MapTuple( 1104 lambda destination, 1105 job_reference: ( 1106 bigquery_tools.parse_table_reference(destination).tableId, 1107 (destination, job_reference))) 1108 | beam.GroupByKey() 1109 | beam.MapTuple(lambda tableId, batch: list(batch))) 1110 else: 1111 # Loads can happen in parallel. 1112 finished_temp_tables_load_job_ids_list_pc = ( 1113 finished_temp_tables_load_job_ids_pc | beam.Map(lambda x: [x])) 1114 1115 copy_job_outputs = ( 1116 finished_temp_tables_load_job_ids_list_pc 1117 | beam.ParDo( 1118 TriggerCopyJobs( 1119 project=self.project, 1120 create_disposition=self.create_disposition, 1121 write_disposition=self.write_disposition, 1122 test_client=self.test_client, 1123 step_name=step_name, 1124 load_job_project_id=self.load_job_project_id), 1125 copy_job_name_pcv, 1126 pvalue.AsIter(schema_mod_job_ids_pc)).with_outputs( 1127 TriggerCopyJobs.TRIGGER_DELETE_TEMP_TABLES, main='main')) 1128 1129 destination_copy_job_ids_pc = copy_job_outputs['main'] 1130 trigger_delete = copy_job_outputs[ 1131 TriggerCopyJobs.TRIGGER_DELETE_TEMP_TABLES] 1132 1133 _ = ( 1134 temp_tables_pc 1135 | "RemoveTempTables/AddUselessValue" >> beam.Map( 1136 lambda x, unused_trigger: (x, None), pvalue.AsList(trigger_delete)) 1137 | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey() 1138 | "RemoveTempTables/GetTableNames" >> beam.Keys() 1139 | "RemoveTempTables/Delete" >> beam.ParDo( 1140 DeleteTablesFn(self.test_client))) 1141 1142 # Load data directly to destination table 1143 destination_load_job_ids_pc = ( 1144 partitions_direct_to_destination 1145 | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo( 1146 TriggerLoadJobs( 1147 schema=self.schema, 1148 write_disposition=self.write_disposition, 1149 create_disposition=self.create_disposition, 1150 test_client=self.test_client, 1151 temporary_tables=False, 1152 additional_bq_parameters=self.additional_bq_parameters, 1153 source_format=self._temp_file_format, 1154 step_name=step_name, 1155 load_job_project_id=self.load_job_project_id), 1156 load_job_name_pcv, 1157 *self.schema_side_inputs).with_outputs( 1158 TriggerLoadJobs.ONGOING_JOBS, main='main') 1159 )[TriggerLoadJobs.ONGOING_JOBS] 1160 1161 destination_load_job_ids_pc = ( 1162 (temp_tables_load_job_ids_pc, destination_load_job_ids_pc) 1163 | beam.Flatten()) 1164 1165 return destination_load_job_ids_pc, destination_copy_job_ids_pc 1166 1167 def expand(self, pcoll): 1168 p = pcoll.pipeline 1169 self.project = self.project or p.options.view_as(GoogleCloudOptions).project 1170 try: 1171 step_name = self.label 1172 except AttributeError: 1173 step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT 1174 BigQueryBatchFileLoads.COUNT += 1 1175 1176 temp_location = p.options.view_as(GoogleCloudOptions).temp_location 1177 job_name = ( 1178 p.options.view_as(GoogleCloudOptions).job_name or 'AUTOMATIC_JOB_NAME') 1179 1180 empty_pc = p | "ImpulseEmptyPC" >> beam.Create([]) 1181 singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None]) 1182 1183 load_job_name_pcv = pvalue.AsSingleton( 1184 singleton_pc 1185 | "LoadJobNamePrefix" >> beam.Map( 1186 lambda _: _generate_job_name( 1187 job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP'))) 1188 1189 schema_mod_job_name_pcv = pvalue.AsSingleton( 1190 singleton_pc 1191 | "SchemaModJobNamePrefix" >> beam.Map( 1192 lambda _: _generate_job_name( 1193 job_name, 1194 bigquery_tools.BigQueryJobTypes.LOAD, 1195 'SCHEMA_MOD_STEP'))) 1196 1197 
copy_job_name_pcv = pvalue.AsSingleton( 1198 singleton_pc 1199 | "CopyJobNamePrefix" >> beam.Map( 1200 lambda _: _generate_job_name( 1201 job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP'))) 1202 1203 file_prefix_pcv = pvalue.AsSingleton( 1204 singleton_pc 1205 | "GenerateFilePrefix" >> beam.Map( 1206 file_prefix_generator( 1207 self._validate, self._custom_gcs_temp_location, temp_location))) 1208 1209 destination_data_kv_pc = ( 1210 pcoll 1211 | "RewindowIntoGlobal" >> self._window_fn() 1212 | "AppendDestination" >> beam.ParDo( 1213 bigquery_tools.AppendDestinationsFn(self.destination), 1214 *self.table_side_inputs)) 1215 1216 if not self.with_auto_sharding: 1217 all_destination_file_pairs_pc = self._write_files( 1218 destination_data_kv_pc, file_prefix_pcv) 1219 else: 1220 all_destination_file_pairs_pc = self._write_files_with_auto_sharding( 1221 destination_data_kv_pc, file_prefix_pcv) 1222 1223 grouped_files_pc = ( 1224 all_destination_file_pairs_pc 1225 | "GroupFilesByTableDestinations" >> beam.GroupByKey()) 1226 1227 partitions = ( 1228 grouped_files_pc 1229 | beam.ParDo( 1230 PartitionFiles( 1231 self.max_partition_size, 1232 self.max_files_per_partition)).with_outputs( 1233 PartitionFiles.MULTIPLE_PARTITIONS_TAG, 1234 PartitionFiles.SINGLE_PARTITION_TAG)) 1235 1236 multiple_partitions_per_destination_pc = partitions[ 1237 PartitionFiles.MULTIPLE_PARTITIONS_TAG] 1238 single_partition_per_destination_pc = partitions[ 1239 PartitionFiles.SINGLE_PARTITION_TAG] 1240 1241 # When using dynamic destinations, elements with both single as well as 1242 # multiple partitions are loaded into BigQuery using temporary tables to 1243 # ensure atomicity. 1244 if self.dynamic_destinations: 1245 all_partitions = (( 1246 multiple_partitions_per_destination_pc, 1247 single_partition_per_destination_pc) 1248 | "FlattenPartitions" >> beam.Flatten()) 1249 destination_load_job_ids_pc, destination_copy_job_ids_pc = ( 1250 self._load_data(all_partitions, 1251 empty_pc, 1252 load_job_name_pcv, 1253 schema_mod_job_name_pcv, 1254 copy_job_name_pcv, 1255 p, 1256 step_name)) 1257 else: 1258 destination_load_job_ids_pc, destination_copy_job_ids_pc = ( 1259 self._load_data(multiple_partitions_per_destination_pc, 1260 single_partition_per_destination_pc, 1261 load_job_name_pcv, 1262 schema_mod_job_name_pcv, 1263 copy_job_name_pcv, 1264 p, 1265 step_name)) 1266 1267 return { 1268 self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc, 1269 self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc, 1270 self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc, 1271 }
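
# A minimal usage sketch (illustration only, not part of the module above):
# the file-loads path implemented by BigQueryBatchFileLoads is normally
# reached through the public WriteToBigQuery transform with
# method=FILE_LOADS. The project, dataset, table, schema, and bucket names
# below are placeholder assumptions.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def _example_batch_file_loads():
  options = PipelineOptions(temp_location='gs://example-bucket/tmp')
  with beam.Pipeline(options=options) as p:
    _ = (
        p
        | beam.Create([{'name': 'a', 'value': 1}, {'name': 'b', 'value': 2}])
        | beam.io.WriteToBigQuery(
            table='example-project:example_dataset.example_table',
            schema='name:STRING,value:INTEGER',
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            # Rows are staged as files under
            # <custom_gcs_temp_location>/bq_load/<uuid> before load jobs run.
            custom_gcs_temp_location='gs://example-bucket/bq_staging',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    # In a streaming pipeline, triggering_frequency (and optionally
    # with_auto_sharding=True) would also be passed, per verify() above.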
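
# A standalone illustration of the PartitionFiles.Partition bookkeeping used
# above: a partition accepts files until adding one more would exceed either
# the byte budget (max_partition_size) or the file-count budget
# (max_files_per_partition), at which point a new partition is started. The
# limits and file sizes here are toy values chosen for the example.

from apache_beam.io.gcp.bigquery_file_loads import PartitionFiles


def _example_partitioning():
  files = [('f1', 40), ('f2', 50), ('f3', 30), ('f4', 10)]  # (path, size)
  max_size, max_files = 100, 3
  partitions = []
  current = PartitionFiles.Partition(max_size, max_files)
  for path, size in files:
    if not current.can_accept(size):
      partitions.append(current.files)
      current = PartitionFiles.Partition(max_size, max_files)
    current.add(path, size)
  partitions.append(current.files)
  # f1 + f2 use 90 of the 100-byte budget; f3 would exceed it, so a second
  # partition is started: [['f1', 'f2'], ['f3', 'f4']]
  return partitions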
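
# The seeded form of _bq_uuid above is deterministic (an MD5 hex digest of
# the seed), which is why the destination hash embedded in load, copy, and
# schema-modification job names is stable for a given project:dataset.table,
# while the unseeded form contributes a fresh random component per job. A
# small sketch of that property, using an arbitrary seed string:

from apache_beam.io.gcp.bigquery_file_loads import _bq_uuid

assert _bq_uuid('proj:dataset.table') == _bq_uuid('proj:dataset.table')
assert _bq_uuid() != _bq_uuid()  # unseeded UUIDs are effectively unique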