github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_tools.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 """Tools used by BigQuery sources and sinks. 19 20 Classes, constants and functions in this file are experimental and have no 21 backwards compatibility guarantees. 22 23 These tools include wrappers and clients to interact with BigQuery APIs. 24 25 NOTHING IN THIS FILE HAS BACKWARDS COMPATIBILITY GUARANTEES. 26 """ 27 28 # pytype: skip-file 29 30 import datetime 31 import decimal 32 import io 33 import json 34 import logging 35 import sys 36 import time 37 import uuid 38 from json.decoder import JSONDecodeError 39 from typing import Optional 40 from typing import Sequence 41 from typing import Tuple 42 from typing import TypeVar 43 from typing import Union 44 45 import fastavro 46 import numpy as np 47 import regex 48 49 import apache_beam 50 from apache_beam import coders 51 from apache_beam.internal.gcp import auth 52 from apache_beam.internal.gcp.json_value import from_json_value 53 from apache_beam.internal.http_client import get_new_http 54 from apache_beam.internal.metrics.metric import MetricLogger 55 from apache_beam.internal.metrics.metric import Metrics 56 from apache_beam.internal.metrics.metric import ServiceCallMetric 57 from apache_beam.io.gcp import bigquery_avro_tools 58 from apache_beam.io.gcp import resource_identifiers 59 from apache_beam.io.gcp.internal.clients import bigquery 60 from apache_beam.metrics import monitoring_infos 61 from apache_beam.options import value_provider 62 from apache_beam.options.pipeline_options import PipelineOptions 63 from apache_beam.transforms import DoFn 64 from apache_beam.typehints.row_type import RowTypeConstraint 65 from apache_beam.typehints.typehints import Any 66 from apache_beam.utils import retry 67 from apache_beam.utils.histogram import LinearBucket 68 69 # Protect against environments where bigquery library is not available. 70 # pylint: disable=wrong-import-order, wrong-import-position 71 try: 72 from apitools.base.py.transfer import Upload 73 from apitools.base.py.exceptions import HttpError, HttpForbiddenError 74 from google.api_core.exceptions import ClientError, GoogleAPICallError 75 from google.api_core.client_info import ClientInfo 76 from google.cloud import bigquery as gcp_bigquery 77 except ImportError: 78 gcp_bigquery = None 79 pass 80 81 try: 82 from orjson import dumps as fast_json_dumps 83 from orjson import loads as fast_json_loads 84 except ImportError: 85 fast_json_dumps = json.dumps 86 fast_json_loads = json.loads 87 88 # pylint: enable=wrong-import-order, wrong-import-position 89 90 # pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports 91 try: 92 from apache_beam.io.gcp.internal.clients.bigquery import TableReference 93 except ImportError: 94 TableReference = None 95 # pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports 96 97 _LOGGER = logging.getLogger(__name__) 98 99 JSON_COMPLIANCE_ERROR = 'NAN, INF and -INF values are not JSON compliant.' 100 MAX_RETRIES = 3 101 UNKNOWN_MIME_TYPE = 'application/octet-stream' 102 103 # Timeout for a BQ streaming insert RPC. Set to a maximum of 2 minutes. 104 BQ_STREAMING_INSERT_TIMEOUT_SEC = 120 105 106 _PROJECT_PATTERN = r'([a-z0-9.-]+:)?[a-z][a-z0-9-]*[a-z0-9]' 107 _DATASET_PATTERN = r'\w{1,1024}' 108 _TABLE_PATTERN = r'[\p{L}\p{M}\p{N}\p{Pc}\p{Pd}\p{Zs}$]{1,1024}' 109 110 # TODO(https://github.com/apache/beam/issues/25946): Add support for 111 # more Beam portable schema types as Python types 112 BIGQUERY_TYPE_TO_PYTHON_TYPE = { 113 "STRING": str, 114 "BOOL": bool, 115 "BOOLEAN": bool, 116 "BYTES": bytes, 117 "INT64": np.int64, 118 "INTEGER": np.int64, 119 "FLOAT64": np.float64, 120 "FLOAT": np.float64, 121 "NUMERIC": decimal.Decimal, 122 "TIMESTAMP": apache_beam.utils.timestamp.Timestamp, 123 } 124 125 126 class FileFormat(object): 127 CSV = 'CSV' 128 JSON = 'NEWLINE_DELIMITED_JSON' 129 AVRO = 'AVRO' 130 131 132 class ExportCompression(object): 133 GZIP = 'GZIP' 134 DEFLATE = 'DEFLATE' 135 SNAPPY = 'SNAPPY' 136 NONE = 'NONE' 137 138 139 def default_encoder(obj): 140 if isinstance(obj, decimal.Decimal): 141 return str(obj) 142 elif isinstance(obj, bytes): 143 # on python 3 base64-encoded bytes are decoded to strings 144 # before being sent to BigQuery 145 return obj.decode('utf-8') 146 elif isinstance(obj, (datetime.date, datetime.time)): 147 return str(obj) 148 elif isinstance(obj, datetime.datetime): 149 return obj.isoformat() 150 151 _LOGGER.error("Unable to serialize %r to JSON", obj) 152 raise TypeError( 153 "Object of type '%s' is not JSON serializable" % type(obj).__name__) 154 155 156 def get_hashable_destination(destination): 157 """Parses a table reference into a (project, dataset, table) tuple. 158 159 Args: 160 destination: Either a TableReference object from the bigquery API. 161 The object has the following attributes: projectId, datasetId, and 162 tableId. Or a string representing the destination containing 163 'PROJECT:DATASET.TABLE'. 164 Returns: 165 A string representing the destination containing 166 'PROJECT:DATASET.TABLE'. 167 """ 168 if isinstance(destination, TableReference): 169 return '%s:%s.%s' % ( 170 destination.projectId, destination.datasetId, destination.tableId) 171 else: 172 return destination 173 174 175 V = TypeVar('V') 176 177 178 def to_hashable_table_ref( 179 table_ref_elem_kv: Tuple[Union[str, TableReference], V]) -> Tuple[str, V]: 180 """Turns the key of the input tuple to its string representation. The key 181 should be either a string or a TableReference. 182 183 Args: 184 table_ref_elem_kv: A tuple of table reference and element. 185 186 Returns: 187 A tuple of string representation of input table and input element. 188 """ 189 table_ref = table_ref_elem_kv[0] 190 hashable_table_ref = get_hashable_destination(table_ref) 191 return (hashable_table_ref, table_ref_elem_kv[1]) 192 193 194 def parse_table_schema_from_json(schema_string): 195 """Parse the Table Schema provided as string. 196 197 Args: 198 schema_string: String serialized table schema, should be a valid JSON. 199 200 Returns: 201 A TableSchema of the BigQuery export from either the Query or the Table. 202 """ 203 try: 204 json_schema = json.loads(schema_string) 205 except JSONDecodeError as e: 206 raise ValueError( 207 'Unable to parse JSON schema: %s - %r' % (schema_string, e)) 208 209 def _parse_schema_field(field): 210 """Parse a single schema field from dictionary. 211 212 Args: 213 field: Dictionary object containing serialized schema. 214 215 Returns: 216 A TableFieldSchema for a single column in BigQuery. 217 """ 218 schema = bigquery.TableFieldSchema() 219 schema.name = field['name'] 220 schema.type = field['type'] 221 if 'mode' in field: 222 schema.mode = field['mode'] 223 else: 224 schema.mode = 'NULLABLE' 225 if 'description' in field: 226 schema.description = field['description'] 227 if 'fields' in field: 228 schema.fields = [_parse_schema_field(x) for x in field['fields']] 229 return schema 230 231 fields = [_parse_schema_field(f) for f in json_schema['fields']] 232 return bigquery.TableSchema(fields=fields) 233 234 235 def parse_table_reference(table, dataset=None, project=None): 236 """Parses a table reference into a (project, dataset, table) tuple. 237 238 Args: 239 table: The ID of the table. The ID must contain only letters 240 (a-z, A-Z), numbers (0-9), connectors (-_). If dataset argument is None 241 then the table argument must contain the entire table reference: 242 'DATASET.TABLE' or 'PROJECT:DATASET.TABLE'. This argument can be a 243 TableReference instance in which case dataset and project are 244 ignored and the reference is returned as a result. Additionally, for date 245 partitioned tables, appending '$YYYYmmdd' to the table name is supported, 246 e.g. 'DATASET.TABLE$YYYYmmdd'. 247 dataset: The ID of the dataset containing this table or null if the table 248 reference is specified entirely by the table argument. 249 project: The ID of the project containing this table or null if the table 250 reference is specified entirely by the table (and possibly dataset) 251 argument. 252 253 Returns: 254 A TableReference object from the bigquery API. The object has the following 255 attributes: projectId, datasetId, and tableId. 256 If the input is a TableReference object, a new object will be returned. 257 258 Raises: 259 ValueError: if the table reference as a string does not match the expected 260 format. 261 """ 262 263 if isinstance(table, TableReference): 264 return TableReference( 265 projectId=table.projectId, 266 datasetId=table.datasetId, 267 tableId=table.tableId) 268 elif callable(table): 269 return table 270 elif isinstance(table, value_provider.ValueProvider): 271 return table 272 273 table_reference = TableReference() 274 # If dataset argument is not specified, the expectation is that the 275 # table argument will contain a full table reference instead of just a 276 # table name. 277 if dataset is None: 278 pattern = ( 279 f'((?P<project>{_PROJECT_PATTERN})[:\\.])?' 280 f'(?P<dataset>{_DATASET_PATTERN})\\.(?P<table>{_TABLE_PATTERN})') 281 match = regex.fullmatch(pattern, table) 282 if not match: 283 raise ValueError( 284 'Expected a table reference (PROJECT:DATASET.TABLE or ' 285 'DATASET.TABLE) instead of %s.' % table) 286 table_reference.projectId = match.group('project') 287 table_reference.datasetId = match.group('dataset') 288 table_reference.tableId = match.group('table') 289 else: 290 table_reference.projectId = project 291 table_reference.datasetId = dataset 292 table_reference.tableId = table 293 return table_reference 294 295 296 # ----------------------------------------------------------------------------- 297 # BigQueryWrapper. 298 299 300 def _build_job_labels(input_labels): 301 """Builds job label protobuf structure.""" 302 input_labels = input_labels or {} 303 result = bigquery.JobConfiguration.LabelsValue() 304 305 for k, v in input_labels.items(): 306 result.additionalProperties.append( 307 bigquery.JobConfiguration.LabelsValue.AdditionalProperty( 308 key=k, 309 value=v, 310 )) 311 return result 312 313 314 def _build_dataset_labels(input_labels): 315 """Builds dataset label protobuf structure.""" 316 input_labels = input_labels or {} 317 result = bigquery.Dataset.LabelsValue() 318 319 for k, v in input_labels.items(): 320 result.additionalProperties.append( 321 bigquery.Dataset.LabelsValue.AdditionalProperty( 322 key=k, 323 value=v, 324 )) 325 return result 326 327 328 def _build_filter_from_labels(labels): 329 filter_str = '' 330 for key, value in labels.items(): 331 filter_str += 'labels.' + key + ':' + value + ' ' 332 return filter_str 333 334 335 class BigQueryWrapper(object): 336 """BigQuery client wrapper with utilities for querying. 337 338 The wrapper is used to organize all the BigQuery integration points and 339 offer a common place where retry logic for failures can be controlled. 340 In addition, it offers various functions used both in sources and sinks 341 (e.g., find and create tables, query a table, etc.). 342 """ 343 344 # If updating following names, also update the corresponding pydocs in 345 # bigquery.py. 346 TEMP_TABLE = 'beam_temp_table_' 347 TEMP_DATASET = 'beam_temp_dataset_' 348 349 HISTOGRAM_METRIC_LOGGER = MetricLogger() 350 351 def __init__(self, client=None, temp_dataset_id=None, temp_table_ref=None): 352 self.client = client or bigquery.BigqueryV2( 353 http=get_new_http(), 354 credentials=auth.get_service_credentials(PipelineOptions()), 355 response_encoding='utf8', 356 additional_http_headers={ 357 "user-agent": "apache-beam-%s" % apache_beam.__version__ 358 }) 359 self.gcp_bq_client = client or gcp_bigquery.Client( 360 client_info=ClientInfo( 361 user_agent="apache-beam-%s" % apache_beam.__version__)) 362 self._unique_row_id = 0 363 # For testing scenarios where we pass in a client we do not want a 364 # randomized prefix for row IDs. 365 self._row_id_prefix = '' if client else uuid.uuid4() 366 self._latency_histogram_metric = Metrics.histogram( 367 self.__class__, 368 'latency_histogram_ms', 369 LinearBucket(0, 20, 3000), 370 BigQueryWrapper.HISTOGRAM_METRIC_LOGGER) 371 372 if temp_dataset_id is not None and temp_table_ref is not None: 373 raise ValueError( 374 'Both a BigQuery temp_dataset_id and a temp_table_ref were specified.' 375 ' Please specify only one of these.') 376 377 if temp_dataset_id and temp_dataset_id.startswith(self.TEMP_DATASET): 378 raise ValueError( 379 'User provided temp dataset ID cannot start with %r' % 380 self.TEMP_DATASET) 381 382 if temp_table_ref is not None: 383 self.temp_table_ref = temp_table_ref 384 self.temp_dataset_id = temp_table_ref.datasetId 385 else: 386 self.temp_table_ref = None 387 self._temporary_table_suffix = uuid.uuid4().hex 388 self.temp_dataset_id = temp_dataset_id or self._get_temp_dataset() 389 390 self.created_temp_dataset = False 391 392 @property 393 def unique_row_id(self): 394 """Returns a unique row ID (str) used to avoid multiple insertions. 395 396 If the row ID is provided, BigQuery will make a best effort to not insert 397 the same row multiple times for fail and retry scenarios in which the insert 398 request may be issued several times. This comes into play for sinks executed 399 in a local runner. 400 401 Returns: 402 a unique row ID string 403 """ 404 self._unique_row_id += 1 405 return '%s_%d' % (self._row_id_prefix, self._unique_row_id) 406 407 def _get_temp_table(self, project_id): 408 if self.temp_table_ref: 409 return self.temp_table_ref 410 411 return parse_table_reference( 412 table=BigQueryWrapper.TEMP_TABLE + self._temporary_table_suffix, 413 dataset=self.temp_dataset_id, 414 project=project_id) 415 416 def _get_temp_dataset(self): 417 if self.temp_table_ref: 418 return self.temp_table_ref.datasetId 419 return BigQueryWrapper.TEMP_DATASET + self._temporary_table_suffix 420 421 @retry.with_exponential_backoff( 422 num_retries=MAX_RETRIES, 423 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 424 def get_query_location(self, project_id, query, use_legacy_sql): 425 """ 426 Get the location of tables referenced in a query. 427 428 This method returns the location of the first available referenced 429 table for user in the query and depends on the BigQuery service to 430 provide error handling for queries that reference tables in multiple 431 locations. 432 """ 433 reference = bigquery.JobReference( 434 jobId=uuid.uuid4().hex, projectId=project_id) 435 request = bigquery.BigqueryJobsInsertRequest( 436 projectId=project_id, 437 job=bigquery.Job( 438 configuration=bigquery.JobConfiguration( 439 dryRun=True, 440 query=bigquery.JobConfigurationQuery( 441 query=query, 442 useLegacySql=use_legacy_sql, 443 )), 444 jobReference=reference)) 445 446 response = self.client.jobs.Insert(request) 447 448 if response.statistics is None: 449 # This behavior is only expected in tests 450 _LOGGER.warning( 451 "Unable to get location, missing response.statistics. Query: %s", 452 query) 453 return None 454 455 referenced_tables = response.statistics.query.referencedTables 456 if referenced_tables: # Guards against both non-empty and non-None 457 for table in referenced_tables: 458 try: 459 location = self.get_table_location( 460 table.projectId, table.datasetId, table.tableId) 461 except HttpForbiddenError: 462 # Permission access for table (i.e. from authorized_view), 463 # try next one 464 continue 465 _LOGGER.info( 466 "Using location %r from table %r referenced by query %s", 467 location, 468 table, 469 query) 470 return location 471 472 _LOGGER.debug( 473 "Query %s does not reference any tables or " 474 "you don't have permission to inspect them.", 475 query) 476 return None 477 478 @retry.with_exponential_backoff( 479 num_retries=MAX_RETRIES, 480 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 481 def _insert_copy_job( 482 self, 483 project_id, 484 job_id, 485 from_table_reference, 486 to_table_reference, 487 create_disposition=None, 488 write_disposition=None, 489 job_labels=None): 490 reference = bigquery.JobReference() 491 reference.jobId = job_id 492 reference.projectId = project_id 493 request = bigquery.BigqueryJobsInsertRequest( 494 projectId=project_id, 495 job=bigquery.Job( 496 configuration=bigquery.JobConfiguration( 497 copy=bigquery.JobConfigurationTableCopy( 498 destinationTable=to_table_reference, 499 sourceTable=from_table_reference, 500 createDisposition=create_disposition, 501 writeDisposition=write_disposition, 502 ), 503 labels=_build_job_labels(job_labels), 504 ), 505 jobReference=reference, 506 )) 507 508 return self._start_job(request).jobReference 509 510 @retry.with_exponential_backoff( 511 num_retries=MAX_RETRIES, 512 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 513 def _insert_load_job( 514 self, 515 project_id, 516 job_id, 517 table_reference, 518 source_uris=None, 519 source_stream=None, 520 schema=None, 521 write_disposition=None, 522 create_disposition=None, 523 additional_load_parameters=None, 524 source_format=None, 525 job_labels=None): 526 527 if not source_uris and not source_stream: 528 _LOGGER.warning( 529 'Both source URIs and source stream are not provided. BigQuery load ' 530 'job will not load any data.') 531 532 if source_uris and source_stream: 533 raise ValueError( 534 'Only one of source_uris and source_stream may be specified. ' 535 'Got both.') 536 537 if source_uris is None: 538 source_uris = [] 539 540 additional_load_parameters = additional_load_parameters or {} 541 job_schema = None if schema == 'SCHEMA_AUTODETECT' else schema 542 reference = bigquery.JobReference(jobId=job_id, projectId=project_id) 543 request = bigquery.BigqueryJobsInsertRequest( 544 projectId=project_id, 545 job=bigquery.Job( 546 configuration=bigquery.JobConfiguration( 547 load=bigquery.JobConfigurationLoad( 548 sourceUris=source_uris, 549 destinationTable=table_reference, 550 schema=job_schema, 551 writeDisposition=write_disposition, 552 createDisposition=create_disposition, 553 sourceFormat=source_format, 554 useAvroLogicalTypes=True, 555 autodetect=schema == 'SCHEMA_AUTODETECT', 556 **additional_load_parameters), 557 labels=_build_job_labels(job_labels), 558 ), 559 jobReference=reference, 560 )) 561 return self._start_job(request, stream=source_stream).jobReference 562 563 def _start_job( 564 self, 565 request, # type: bigquery.BigqueryJobsInsertRequest 566 stream=None, 567 ): 568 """Inserts a BigQuery job. 569 570 If the job exists already, it returns it. 571 572 Args: 573 request (bigquery.BigqueryJobsInsertRequest): An insert job request. 574 stream (IO[bytes]): A bytes IO object open for reading. 575 """ 576 try: 577 upload = None 578 if stream: 579 upload = Upload.FromStream(stream, mime_type=UNKNOWN_MIME_TYPE) 580 response = self.client.jobs.Insert(request, upload=upload) 581 _LOGGER.info( 582 "Started BigQuery job: %s\n " 583 "bq show -j --format=prettyjson --project_id=%s %s", 584 response.jobReference, 585 response.jobReference.projectId, 586 response.jobReference.jobId) 587 return response 588 except HttpError as exn: 589 if exn.status_code == 409: 590 _LOGGER.info( 591 "BigQuery job %s already exists, will not retry inserting it: %s", 592 request.job.jobReference, 593 exn) 594 return request.job 595 else: 596 _LOGGER.info( 597 "Failed to insert job %s: %s", request.job.jobReference, exn) 598 raise 599 600 @retry.with_exponential_backoff( 601 num_retries=MAX_RETRIES, 602 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 603 def _start_query_job( 604 self, 605 project_id, 606 query, 607 use_legacy_sql, 608 flatten_results, 609 job_id, 610 priority, 611 dry_run=False, 612 kms_key=None, 613 job_labels=None): 614 reference = bigquery.JobReference(jobId=job_id, projectId=project_id) 615 request = bigquery.BigqueryJobsInsertRequest( 616 projectId=project_id, 617 job=bigquery.Job( 618 configuration=bigquery.JobConfiguration( 619 dryRun=dry_run, 620 query=bigquery.JobConfigurationQuery( 621 query=query, 622 useLegacySql=use_legacy_sql, 623 allowLargeResults=not dry_run, 624 destinationTable=self._get_temp_table(project_id) 625 if not dry_run else None, 626 flattenResults=flatten_results, 627 priority=priority, 628 destinationEncryptionConfiguration=bigquery. 629 EncryptionConfiguration(kmsKeyName=kms_key)), 630 labels=_build_job_labels(job_labels), 631 ), 632 jobReference=reference)) 633 634 return self._start_job(request) 635 636 def wait_for_bq_job(self, job_reference, sleep_duration_sec=5, max_retries=0): 637 """Poll job until it is DONE. 638 639 Args: 640 job_reference: bigquery.JobReference instance. 641 sleep_duration_sec: Specifies the delay in seconds between retries. 642 max_retries: The total number of times to retry. If equals to 0, 643 the function waits forever. 644 645 Raises: 646 `RuntimeError`: If the job is FAILED or the number of retries has been 647 reached. 648 """ 649 retry = 0 650 while True: 651 retry += 1 652 job = self.get_job( 653 job_reference.projectId, job_reference.jobId, job_reference.location) 654 logging.info('Job status: %s', job.status.state) 655 if job.status.state == 'DONE' and job.status.errorResult: 656 raise RuntimeError( 657 'BigQuery job {} failed. Error Result: {}'.format( 658 job_reference.jobId, job.status.errorResult)) 659 elif job.status.state == 'DONE': 660 return True 661 else: 662 time.sleep(sleep_duration_sec) 663 if max_retries != 0 and retry >= max_retries: 664 raise RuntimeError('The maximum number of retries has been reached') 665 666 @retry.with_exponential_backoff( 667 num_retries=MAX_RETRIES, 668 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 669 def _get_query_results( 670 self, 671 project_id, 672 job_id, 673 page_token=None, 674 max_results=10000, 675 location=None): 676 request = bigquery.BigqueryJobsGetQueryResultsRequest( 677 jobId=job_id, 678 pageToken=page_token, 679 projectId=project_id, 680 maxResults=max_results, 681 location=location) 682 response = self.client.jobs.GetQueryResults(request) 683 return response 684 685 @retry.with_exponential_backoff( 686 num_retries=MAX_RETRIES, 687 retry_filter=retry.retry_on_server_errors_timeout_or_quota_issues_filter) 688 def _insert_all_rows( 689 self, 690 project_id, 691 dataset_id, 692 table_id, 693 rows, 694 insert_ids, 695 skip_invalid_rows=False, 696 ignore_unknown_values=False): 697 """Calls the insertAll BigQuery API endpoint. 698 699 Docs for this BQ call: https://cloud.google.com/bigquery/docs/reference\ 700 /rest/v2/tabledata/insertAll.""" 701 # The rows argument is a list of 702 # bigquery.TableDataInsertAllRequest.RowsValueListEntry instances as 703 # required by the InsertAll() method. 704 resource = resource_identifiers.BigQueryTable( 705 project_id, dataset_id, table_id) 706 707 labels = { 708 # TODO(ajamato): Add Ptransform label. 709 monitoring_infos.SERVICE_LABEL: 'BigQuery', 710 # Refer to any method which writes elements to BigQuery in batches 711 # as "BigQueryBatchWrite". I.e. storage API's insertAll, or future 712 # APIs introduced. 713 monitoring_infos.METHOD_LABEL: 'BigQueryBatchWrite', 714 monitoring_infos.RESOURCE_LABEL: resource, 715 monitoring_infos.BIGQUERY_PROJECT_ID_LABEL: project_id, 716 monitoring_infos.BIGQUERY_DATASET_LABEL: dataset_id, 717 monitoring_infos.BIGQUERY_TABLE_LABEL: table_id, 718 } 719 service_call_metric = ServiceCallMetric( 720 request_count_urn=monitoring_infos.API_REQUEST_COUNT_URN, 721 base_labels=labels) 722 723 started_millis = int(time.time() * 1000) 724 try: 725 table_ref_str = '%s.%s.%s' % (project_id, dataset_id, table_id) 726 errors = self.gcp_bq_client.insert_rows_json( 727 table_ref_str, 728 json_rows=rows, 729 row_ids=insert_ids, 730 skip_invalid_rows=skip_invalid_rows, 731 ignore_unknown_values=ignore_unknown_values, 732 timeout=BQ_STREAMING_INSERT_TIMEOUT_SEC) 733 if not errors: 734 service_call_metric.call('ok') 735 else: 736 for insert_error in errors: 737 service_call_metric.call(insert_error['errors'][0]) 738 except (ClientError, GoogleAPICallError) as e: 739 # e.code contains the numeric http status code. 740 service_call_metric.call(e.code) 741 # Re-reise the exception so that we re-try appropriately. 742 raise 743 except HttpError as e: 744 service_call_metric.call(e) 745 # Re-reise the exception so that we re-try appropriately. 746 raise 747 finally: 748 self._latency_histogram_metric.update( 749 int(time.time() * 1000) - started_millis) 750 return not errors, errors 751 752 @retry.with_exponential_backoff( 753 num_retries=MAX_RETRIES, 754 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 755 def get_table(self, project_id, dataset_id, table_id): 756 """Lookup a table's metadata object. 757 758 Args: 759 client: bigquery.BigqueryV2 instance 760 project_id: table lookup parameter 761 dataset_id: table lookup parameter 762 table_id: table lookup parameter 763 764 Returns: 765 bigquery.Table instance 766 Raises: 767 HttpError: if lookup failed. 768 """ 769 request = bigquery.BigqueryTablesGetRequest( 770 projectId=project_id, datasetId=dataset_id, tableId=table_id) 771 response = self.client.tables.Get(request) 772 return response 773 774 def _create_table( 775 self, 776 project_id, 777 dataset_id, 778 table_id, 779 schema, 780 additional_parameters=None): 781 782 valid_tablename = regex.fullmatch(_TABLE_PATTERN, table_id, regex.ASCII) 783 if not valid_tablename: 784 raise ValueError( 785 'Invalid BigQuery table name: %s \n' 786 'See https://cloud.google.com/bigquery/docs/tables#table_naming' % 787 table_id) 788 789 additional_parameters = additional_parameters or {} 790 table = bigquery.Table( 791 tableReference=TableReference( 792 projectId=project_id, datasetId=dataset_id, tableId=table_id), 793 schema=schema, 794 **additional_parameters) 795 request = bigquery.BigqueryTablesInsertRequest( 796 projectId=project_id, datasetId=dataset_id, table=table) 797 response = self.client.tables.Insert(request) 798 _LOGGER.debug("Created the table with id %s", table_id) 799 # The response is a bigquery.Table instance. 800 return response 801 802 @retry.with_exponential_backoff( 803 num_retries=MAX_RETRIES, 804 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 805 def get_or_create_dataset( 806 self, project_id, dataset_id, location=None, labels=None): 807 # Check if dataset already exists otherwise create it 808 try: 809 dataset = self.client.datasets.Get( 810 bigquery.BigqueryDatasetsGetRequest( 811 projectId=project_id, datasetId=dataset_id)) 812 self.created_temp_dataset = False 813 return dataset 814 except HttpError as exn: 815 if exn.status_code == 404: 816 _LOGGER.info( 817 'Dataset %s:%s does not exist so we will create it as temporary ' 818 'with location=%s', 819 project_id, 820 dataset_id, 821 location) 822 dataset_reference = bigquery.DatasetReference( 823 projectId=project_id, datasetId=dataset_id) 824 dataset = bigquery.Dataset(datasetReference=dataset_reference) 825 if location is not None: 826 dataset.location = location 827 if labels is not None: 828 dataset.labels = _build_dataset_labels(labels) 829 request = bigquery.BigqueryDatasetsInsertRequest( 830 projectId=project_id, dataset=dataset) 831 response = self.client.datasets.Insert(request) 832 self.created_temp_dataset = True 833 # The response is a bigquery.Dataset instance. 834 return response 835 else: 836 raise 837 838 @retry.with_exponential_backoff( 839 num_retries=MAX_RETRIES, 840 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 841 def _is_table_empty(self, project_id, dataset_id, table_id): 842 request = bigquery.BigqueryTabledataListRequest( 843 projectId=project_id, 844 datasetId=dataset_id, 845 tableId=table_id, 846 maxResults=1) 847 response = self.client.tabledata.List(request) 848 # The response is a bigquery.TableDataList instance. 849 return response.totalRows == 0 850 851 @retry.with_exponential_backoff( 852 num_retries=MAX_RETRIES, 853 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 854 def _delete_table(self, project_id, dataset_id, table_id): 855 request = bigquery.BigqueryTablesDeleteRequest( 856 projectId=project_id, datasetId=dataset_id, tableId=table_id) 857 try: 858 self.client.tables.Delete(request) 859 except HttpError as exn: 860 if exn.status_code == 404: 861 _LOGGER.warning( 862 'Table %s:%s.%s does not exist', project_id, dataset_id, table_id) 863 return 864 else: 865 raise 866 867 @retry.with_exponential_backoff( 868 num_retries=MAX_RETRIES, 869 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 870 def _delete_dataset(self, project_id, dataset_id, delete_contents=True): 871 request = bigquery.BigqueryDatasetsDeleteRequest( 872 projectId=project_id, 873 datasetId=dataset_id, 874 deleteContents=delete_contents) 875 try: 876 self.client.datasets.Delete(request) 877 except HttpError as exn: 878 if exn.status_code == 404: 879 _LOGGER.warning('Dataset %s:%s does not exist', project_id, dataset_id) 880 return 881 else: 882 raise 883 884 @retry.with_exponential_backoff( 885 num_retries=MAX_RETRIES, 886 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 887 def get_table_location(self, project_id, dataset_id, table_id): 888 table = self.get_table(project_id, dataset_id, table_id) 889 return table.location 890 891 # Returns true if the temporary dataset was provided by the user. 892 def is_user_configured_dataset(self): 893 return ( 894 self.temp_dataset_id and 895 not self.temp_dataset_id.startswith(self.TEMP_DATASET)) 896 897 @retry.with_exponential_backoff( 898 num_retries=MAX_RETRIES, 899 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 900 def create_temporary_dataset(self, project_id, location, labels=None): 901 self.get_or_create_dataset( 902 project_id, self.temp_dataset_id, location=location, labels=labels) 903 904 if (project_id is not None and not self.is_user_configured_dataset() and 905 not self.created_temp_dataset): 906 # Unittests don't pass projectIds so they can be run without error 907 # User configured datasets are allowed to pre-exist. 908 raise RuntimeError( 909 'Dataset %s:%s already exists so cannot be used as temporary.' % 910 (project_id, self.temp_dataset_id)) 911 912 @retry.with_exponential_backoff( 913 num_retries=MAX_RETRIES, 914 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 915 def clean_up_temporary_dataset(self, project_id): 916 temp_table = self._get_temp_table(project_id) 917 try: 918 self.client.datasets.Get( 919 bigquery.BigqueryDatasetsGetRequest( 920 projectId=project_id, datasetId=temp_table.datasetId)) 921 except HttpError as exn: 922 if exn.status_code == 404: 923 _LOGGER.warning( 924 'Dataset %s:%s does not exist', project_id, temp_table.datasetId) 925 return 926 else: 927 raise 928 try: 929 # We do not want to delete temporary datasets configured by the user hence 930 # we just delete the temporary table in that case. 931 if not self.is_user_configured_dataset(): 932 self._delete_dataset(temp_table.projectId, temp_table.datasetId, True) 933 else: 934 self._delete_table( 935 temp_table.projectId, temp_table.datasetId, temp_table.tableId) 936 self.created_temp_dataset = False 937 except HttpError as exn: 938 if exn.status_code == 403: 939 _LOGGER.warning( 940 'Permission denied to delete temporary dataset %s:%s for clean up', 941 temp_table.projectId, 942 temp_table.datasetId) 943 return 944 else: 945 raise 946 947 @retry.with_exponential_backoff( 948 num_retries=MAX_RETRIES, 949 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 950 def _clean_up_beam_labelled_temporary_datasets( 951 self, project_id, dataset_id=None, table_id=None, labels=None): 952 if isinstance(labels, dict): 953 filter_str = _build_filter_from_labels(labels) 954 955 if not self.is_user_configured_dataset() and labels is not None: 956 response = ( 957 self.client.datasets.List( 958 bigquery.BigqueryDatasetsListRequest( 959 projectId=project_id, filter=filter_str))) 960 for dataset in response.datasets: 961 try: 962 dataset_id = dataset.datasetReference.datasetId 963 self._delete_dataset(project_id, dataset_id, True) 964 except HttpError as exn: 965 if exn.status_code == 403: 966 _LOGGER.warning( 967 'Permission denied to delete temporary dataset %s:%s for ' 968 'clean up.', 969 project_id, 970 dataset_id) 971 return 972 else: 973 raise 974 else: 975 try: 976 self._delete_table(project_id, dataset_id, table_id) 977 except HttpError as exn: 978 if exn.status_code == 403: 979 _LOGGER.warning( 980 'Permission denied to delete temporary table %s:%s.%s for ' 981 'clean up.', 982 project_id, 983 dataset_id, 984 table_id) 985 return 986 else: 987 raise 988 989 @retry.with_exponential_backoff( 990 num_retries=MAX_RETRIES, 991 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 992 def get_job(self, project, job_id, location=None): 993 request = bigquery.BigqueryJobsGetRequest() 994 request.jobId = job_id 995 request.projectId = project 996 request.location = location 997 998 return self.client.jobs.Get(request) 999 1000 def perform_load_job( 1001 self, 1002 destination, 1003 job_id, 1004 source_uris=None, 1005 source_stream=None, 1006 schema=None, 1007 write_disposition=None, 1008 create_disposition=None, 1009 additional_load_parameters=None, 1010 source_format=None, 1011 job_labels=None, 1012 load_job_project_id=None): 1013 """Starts a job to load data into BigQuery. 1014 1015 Returns: 1016 bigquery.JobReference with the information about the job that was started. 1017 """ 1018 project_id = ( 1019 destination.projectId 1020 if load_job_project_id is None else load_job_project_id) 1021 1022 return self._insert_load_job( 1023 project_id, 1024 job_id, 1025 destination, 1026 source_uris=source_uris, 1027 source_stream=source_stream, 1028 schema=schema, 1029 create_disposition=create_disposition, 1030 write_disposition=write_disposition, 1031 additional_load_parameters=additional_load_parameters, 1032 source_format=source_format, 1033 job_labels=job_labels) 1034 1035 @retry.with_exponential_backoff( 1036 num_retries=MAX_RETRIES, 1037 retry_filter=retry.retry_on_server_errors_and_timeout_filter) 1038 def perform_extract_job( 1039 self, 1040 destination, 1041 job_id, 1042 table_reference, 1043 destination_format, 1044 project=None, 1045 include_header=True, 1046 compression=ExportCompression.NONE, 1047 use_avro_logical_types=False, 1048 job_labels=None): 1049 """Starts a job to export data from BigQuery. 1050 1051 Returns: 1052 bigquery.JobReference with the information about the job that was started. 1053 """ 1054 job_project = project or table_reference.projectId 1055 job_reference = bigquery.JobReference(jobId=job_id, projectId=job_project) 1056 request = bigquery.BigqueryJobsInsertRequest( 1057 projectId=job_project, 1058 job=bigquery.Job( 1059 configuration=bigquery.JobConfiguration( 1060 extract=bigquery.JobConfigurationExtract( 1061 destinationUris=destination, 1062 sourceTable=table_reference, 1063 printHeader=include_header, 1064 destinationFormat=destination_format, 1065 compression=compression, 1066 useAvroLogicalTypes=use_avro_logical_types, 1067 ), 1068 labels=_build_job_labels(job_labels), 1069 ), 1070 jobReference=job_reference, 1071 )) 1072 return self._start_job(request).jobReference 1073 1074 @retry.with_exponential_backoff( 1075 num_retries=MAX_RETRIES, 1076 retry_filter=retry. 1077 retry_if_valid_input_but_server_error_and_timeout_filter) 1078 def get_or_create_table( 1079 self, 1080 project_id, 1081 dataset_id, 1082 table_id, 1083 schema, 1084 create_disposition, 1085 write_disposition, 1086 additional_create_parameters=None): 1087 """Gets or creates a table based on create and write dispositions. 1088 1089 The function mimics the behavior of BigQuery import jobs when using the 1090 same create and write dispositions. 1091 1092 Args: 1093 project_id: The project id owning the table. 1094 dataset_id: The dataset id owning the table. 1095 table_id: The table id. 1096 schema: A bigquery.TableSchema instance or None. 1097 create_disposition: CREATE_NEVER or CREATE_IF_NEEDED. 1098 write_disposition: WRITE_APPEND, WRITE_EMPTY or WRITE_TRUNCATE. 1099 1100 Returns: 1101 A bigquery.Table instance if table was found or created. 1102 1103 Raises: 1104 `RuntimeError`: For various mismatches between the state of the table and 1105 the create/write dispositions passed in. For example if the table is not 1106 empty and WRITE_EMPTY was specified then an error will be raised since 1107 the table was expected to be empty. 1108 """ 1109 from apache_beam.io.gcp.bigquery import BigQueryDisposition 1110 1111 found_table = None 1112 try: 1113 found_table = self.get_table(project_id, dataset_id, table_id) 1114 except HttpError as exn: 1115 if exn.status_code == 404: 1116 if create_disposition == BigQueryDisposition.CREATE_NEVER: 1117 raise RuntimeError( 1118 'Table %s:%s.%s not found but create disposition is CREATE_NEVER.' 1119 % (project_id, dataset_id, table_id)) 1120 else: 1121 raise 1122 1123 # If table exists already then handle the semantics for WRITE_EMPTY and 1124 # WRITE_TRUNCATE write dispositions. 1125 if found_table and write_disposition in ( 1126 BigQueryDisposition.WRITE_EMPTY, BigQueryDisposition.WRITE_TRUNCATE): 1127 # Delete the table and recreate it (later) if WRITE_TRUNCATE was 1128 # specified. 1129 if write_disposition == BigQueryDisposition.WRITE_TRUNCATE: 1130 self._delete_table(project_id, dataset_id, table_id) 1131 elif (write_disposition == BigQueryDisposition.WRITE_EMPTY and 1132 not self._is_table_empty(project_id, dataset_id, table_id)): 1133 raise RuntimeError( 1134 'Table %s:%s.%s is not empty but write disposition is WRITE_EMPTY.' 1135 % (project_id, dataset_id, table_id)) 1136 1137 # Create a new table potentially reusing the schema from a previously 1138 # found table in case the schema was not specified. 1139 if schema is None and found_table is None: 1140 raise RuntimeError( 1141 'Table %s:%s.%s requires a schema. None can be inferred because the ' 1142 'table does not exist.' % (project_id, dataset_id, table_id)) 1143 if found_table and write_disposition != BigQueryDisposition.WRITE_TRUNCATE: 1144 return found_table 1145 else: 1146 created_table = None 1147 try: 1148 created_table = self._create_table( 1149 project_id=project_id, 1150 dataset_id=dataset_id, 1151 table_id=table_id, 1152 schema=schema or found_table.schema, 1153 additional_parameters=additional_create_parameters) 1154 except HttpError as exn: 1155 if exn.status_code == 409: 1156 _LOGGER.debug( 1157 'Skipping Creation. Table %s:%s.%s already exists.' % 1158 (project_id, dataset_id, table_id)) 1159 created_table = self.get_table(project_id, dataset_id, table_id) 1160 else: 1161 raise 1162 _LOGGER.info( 1163 'Created table %s.%s.%s with schema %s. ' 1164 'Result: %s.', 1165 project_id, 1166 dataset_id, 1167 table_id, 1168 schema or found_table.schema, 1169 created_table) 1170 # if write_disposition == BigQueryDisposition.WRITE_TRUNCATE we delete 1171 # the table before this point. 1172 if write_disposition == BigQueryDisposition.WRITE_TRUNCATE: 1173 # BigQuery can route data to the old table for 2 mins max so wait 1174 # that much time before creating the table and writing it 1175 _LOGGER.warning( 1176 'Sleeping for 150 seconds before the write as ' + 1177 'BigQuery inserts can be routed to deleted table ' + 1178 'for 2 mins after the delete and create.') 1179 # TODO(BEAM-2673): Remove this sleep by migrating to load api 1180 time.sleep(150) 1181 return created_table 1182 else: 1183 return created_table 1184 1185 def run_query( 1186 self, 1187 project_id, 1188 query, 1189 use_legacy_sql, 1190 flatten_results, 1191 priority, 1192 dry_run=False, 1193 job_labels=None): 1194 job = self._start_query_job( 1195 project_id, 1196 query, 1197 use_legacy_sql, 1198 flatten_results, 1199 job_id=uuid.uuid4().hex, 1200 priority=priority, 1201 dry_run=dry_run, 1202 job_labels=job_labels) 1203 job_id = job.jobReference.jobId 1204 location = job.jobReference.location 1205 1206 if dry_run: 1207 # If this was a dry run then the fact that we get here means the 1208 # query has no errors. The start_query_job would raise an error otherwise. 1209 return 1210 page_token = None 1211 while True: 1212 response = self._get_query_results( 1213 project_id, job_id, page_token, location=location) 1214 if not response.jobComplete: 1215 # The jobComplete field can be False if the query request times out 1216 # (default is 10 seconds). Note that this is a timeout for the query 1217 # request not for the actual execution of the query in the service. If 1218 # the request times out we keep trying. This situation is quite possible 1219 # if the query will return a large number of rows. 1220 _LOGGER.info('Waiting on response from query: %s ...', query) 1221 time.sleep(1.0) 1222 continue 1223 # We got some results. The last page is signalled by a missing pageToken. 1224 yield response.rows, response.schema 1225 if not response.pageToken: 1226 break 1227 page_token = response.pageToken 1228 1229 def insert_rows( 1230 self, 1231 project_id, 1232 dataset_id, 1233 table_id, 1234 rows, 1235 insert_ids=None, 1236 skip_invalid_rows=False, 1237 ignore_unknown_values=False): 1238 """Inserts rows into the specified table. 1239 1240 Args: 1241 project_id: The project id owning the table. 1242 dataset_id: The dataset id owning the table. 1243 table_id: The table id. 1244 rows: A list of plain Python dictionaries. Each dictionary is a row and 1245 each key in it is the name of a field. 1246 skip_invalid_rows: If there are rows with insertion errors, whether they 1247 should be skipped, and all others should be inserted successfully. 1248 ignore_unknown_values: Set this option to true to ignore unknown column 1249 names. If the input rows contain columns that are not 1250 part of the existing table's schema, those columns are ignored, and 1251 the rows are successfully inserted. 1252 1253 Returns: 1254 A tuple (bool, errors). If first element is False then the second element 1255 will be a bigquery.InsertErrorsValueListEntry instance containing 1256 specific errors. 1257 """ 1258 1259 # Prepare rows for insertion. Of special note is the row ID that we add to 1260 # each row in order to help BigQuery avoid inserting a row multiple times. 1261 # BigQuery will do a best-effort if unique IDs are provided. This situation 1262 # can happen during retries on failures. 1263 # TODO(silviuc): Must add support to writing TableRow's instead of dicts. 1264 insert_ids = [ 1265 str(self.unique_row_id) if not insert_ids else insert_ids[i] for i, 1266 _ in enumerate(rows) 1267 ] 1268 rows = [ 1269 fast_json_loads(fast_json_dumps(r, default=default_encoder)) 1270 for r in rows 1271 ] 1272 1273 result, errors = self._insert_all_rows( 1274 project_id, dataset_id, table_id, rows, insert_ids, 1275 skip_invalid_rows=skip_invalid_rows, 1276 ignore_unknown_values=ignore_unknown_values) 1277 return result, errors 1278 1279 def _convert_cell_value_to_dict(self, value, field): 1280 if field.type == 'STRING': 1281 # Input: "XYZ" --> Output: "XYZ" 1282 return value 1283 elif field.type == 'BOOLEAN': 1284 # Input: "true" --> Output: True 1285 return value == 'true' 1286 elif field.type == 'INTEGER': 1287 # Input: "123" --> Output: 123 1288 return int(value) 1289 elif field.type == 'FLOAT': 1290 # Input: "1.23" --> Output: 1.23 1291 return float(value) 1292 elif field.type == 'TIMESTAMP': 1293 # The UTC should come from the timezone library but this is a known 1294 # issue in python 2.7 so we'll just hardcode it as we're reading using 1295 # utcfromtimestamp. 1296 # Input: 1478134176.985864 --> Output: "2016-11-03 00:49:36.985864 UTC" 1297 dt = datetime.datetime.utcfromtimestamp(float(value)) 1298 return dt.strftime('%Y-%m-%d %H:%M:%S.%f UTC') 1299 elif field.type == 'BYTES': 1300 # Input: "YmJi" --> Output: "YmJi" 1301 return value 1302 elif field.type == 'DATE': 1303 # Input: "2016-11-03" --> Output: "2016-11-03" 1304 return value 1305 elif field.type == 'DATETIME': 1306 # Input: "2016-11-03T00:49:36" --> Output: "2016-11-03T00:49:36" 1307 return value 1308 elif field.type == 'TIME': 1309 # Input: "00:49:36" --> Output: "00:49:36" 1310 return value 1311 elif field.type == 'RECORD': 1312 # Note that a schema field object supports also a RECORD type. However 1313 # when querying, the repeated and/or record fields are flattened 1314 # unless we pass the flatten_results flag as False to the source 1315 return self.convert_row_to_dict(value, field) 1316 elif field.type == 'NUMERIC': 1317 return decimal.Decimal(value) 1318 elif field.type == 'GEOGRAPHY': 1319 return value 1320 else: 1321 raise RuntimeError('Unexpected field type: %s' % field.type) 1322 1323 def convert_row_to_dict(self, row, schema): 1324 """Converts a TableRow instance using the schema to a Python dict.""" 1325 result = {} 1326 for index, field in enumerate(schema.fields): 1327 value = None 1328 if isinstance(schema, bigquery.TableSchema): 1329 cell = row.f[index] 1330 value = from_json_value(cell.v) if cell.v is not None else None 1331 elif isinstance(schema, bigquery.TableFieldSchema): 1332 cell = row['f'][index] 1333 value = cell['v'] if 'v' in cell else None 1334 if field.mode == 'REPEATED': 1335 if value is None: 1336 # Ideally this should never happen as repeated fields default to 1337 # returning an empty list 1338 result[field.name] = [] 1339 else: 1340 result[field.name] = [ 1341 self._convert_cell_value_to_dict(x['v'], field) for x in value 1342 ] 1343 elif value is None: 1344 if not field.mode == 'NULLABLE': 1345 raise ValueError( 1346 'Received \'None\' as the value for the field %s ' 1347 'but the field is not NULLABLE.' % field.name) 1348 result[field.name] = None 1349 else: 1350 result[field.name] = self._convert_cell_value_to_dict(value, field) 1351 return result 1352 1353 1354 class RowAsDictJsonCoder(coders.Coder): 1355 """A coder for a table row (represented as a dict) to/from a JSON string. 1356 1357 This is the default coder for sources and sinks if the coder argument is not 1358 specified. 1359 """ 1360 def encode(self, table_row): 1361 # The normal error when dumping NAN/INF values is: 1362 # ValueError: Out of range float values are not JSON compliant 1363 # This code will catch this error to emit an error that explains 1364 # to the programmer that they have used NAN/INF values. 1365 try: 1366 return json.dumps( 1367 table_row, 1368 allow_nan=False, 1369 ensure_ascii=False, 1370 default=default_encoder).encode('utf-8') 1371 except ValueError as e: 1372 raise ValueError( 1373 '%s. %s. Row: %r' % (e, JSON_COMPLIANCE_ERROR, table_row)) 1374 1375 def decode(self, encoded_table_row): 1376 return json.loads(encoded_table_row.decode('utf-8')) 1377 1378 def to_type_hint(self): 1379 return Any 1380 1381 1382 class JsonRowWriter(io.IOBase): 1383 """ 1384 A writer which provides an IOBase-like interface for writing table rows 1385 (represented as dicts) as newline-delimited JSON strings. 1386 """ 1387 def __init__(self, file_handle): 1388 """Initialize an JsonRowWriter. 1389 1390 Args: 1391 file_handle (io.IOBase): Output stream to write to. 1392 """ 1393 if not file_handle.writable(): 1394 raise ValueError("Output stream must be writable") 1395 1396 self._file_handle = file_handle 1397 self._coder = RowAsDictJsonCoder() 1398 1399 def close(self): 1400 self._file_handle.close() 1401 1402 @property 1403 def closed(self): 1404 return self._file_handle.closed 1405 1406 def flush(self): 1407 self._file_handle.flush() 1408 1409 def read(self, size=-1): 1410 raise io.UnsupportedOperation("JsonRowWriter is not readable") 1411 1412 def tell(self): 1413 return self._file_handle.tell() 1414 1415 def writable(self): 1416 return self._file_handle.writable() 1417 1418 def write(self, row): 1419 return self._file_handle.write(self._coder.encode(row) + b'\n') 1420 1421 1422 class AvroRowWriter(io.IOBase): 1423 """ 1424 A writer which provides an IOBase-like interface for writing table rows 1425 (represented as dicts) as Avro records. 1426 """ 1427 def __init__(self, file_handle, schema): 1428 """Initialize an AvroRowWriter. 1429 1430 Args: 1431 file_handle (io.IOBase): Output stream to write Avro records to. 1432 schema (Dict[Text, Any]): BigQuery table schema. 1433 """ 1434 if not file_handle.writable(): 1435 raise ValueError("Output stream must be writable") 1436 1437 self._file_handle = file_handle 1438 avro_schema = fastavro.parse_schema( 1439 get_avro_schema_from_table_schema(schema)) 1440 self._avro_writer = fastavro.write.Writer(self._file_handle, avro_schema) 1441 1442 def close(self): 1443 if not self._file_handle.closed: 1444 self.flush() 1445 self._file_handle.close() 1446 1447 @property 1448 def closed(self): 1449 return self._file_handle.closed 1450 1451 def flush(self): 1452 if self._file_handle.closed: 1453 raise ValueError("flush on closed file") 1454 1455 self._avro_writer.flush() 1456 self._file_handle.flush() 1457 1458 def read(self, size=-1): 1459 raise io.UnsupportedOperation("AvroRowWriter is not readable") 1460 1461 def tell(self): 1462 # Flush the fastavro Writer to the underlying stream, otherwise there isn't 1463 # a reliable way to determine how many bytes have been written. 1464 self._avro_writer.flush() 1465 return self._file_handle.tell() 1466 1467 def writable(self): 1468 return self._file_handle.writable() 1469 1470 def write(self, row): 1471 try: 1472 self._avro_writer.write(row) 1473 except (TypeError, ValueError) as ex: 1474 _, _, tb = sys.exc_info() 1475 raise ex.__class__( 1476 "Error writing row to Avro: {}\nSchema: {}\nRow: {}".format( 1477 ex, self._avro_writer.schema, row)).with_traceback(tb) 1478 1479 1480 class RetryStrategy(object): 1481 RETRY_ALWAYS = 'RETRY_ALWAYS' 1482 RETRY_NEVER = 'RETRY_NEVER' 1483 RETRY_ON_TRANSIENT_ERROR = 'RETRY_ON_TRANSIENT_ERROR' 1484 1485 _NON_TRANSIENT_ERRORS = {'invalid', 'invalidQuery', 'notImplemented'} 1486 1487 @staticmethod 1488 def should_retry(strategy, error_message): 1489 if strategy == RetryStrategy.RETRY_ALWAYS: 1490 return True 1491 elif strategy == RetryStrategy.RETRY_NEVER: 1492 return False 1493 elif (strategy == RetryStrategy.RETRY_ON_TRANSIENT_ERROR and 1494 error_message not in RetryStrategy._NON_TRANSIENT_ERRORS): 1495 return True 1496 else: 1497 return False 1498 1499 1500 class AppendDestinationsFn(DoFn): 1501 """Adds the destination to an element, making it a KV pair. 1502 1503 Outputs a PCollection of KV-pairs where the key is a TableReference for the 1504 destination, and the value is the record itself. 1505 1506 Experimental; no backwards compatibility guarantees. 1507 """ 1508 def __init__(self, destination): 1509 self._display_destination = destination 1510 self.destination = AppendDestinationsFn._get_table_fn(destination) 1511 1512 def display_data(self): 1513 return {'destination': str(self._display_destination)} 1514 1515 @staticmethod 1516 def _value_provider_or_static_val(elm): 1517 if isinstance(elm, value_provider.ValueProvider): 1518 return elm 1519 else: 1520 # The type argument is a NoOp, because we assume the argument already has 1521 # the proper formatting. 1522 return value_provider.StaticValueProvider(lambda x: x, value=elm) 1523 1524 @staticmethod 1525 def _get_table_fn(destination): 1526 if callable(destination): 1527 return destination 1528 else: 1529 return lambda x: AppendDestinationsFn._value_provider_or_static_val( 1530 destination).get() 1531 1532 def process(self, element, *side_inputs): 1533 yield (self.destination(element, *side_inputs), element) 1534 1535 1536 def beam_row_from_dict(row: dict, schema): 1537 """Converts a dictionary row to a Beam Row. 1538 Nested records and lists are supported. 1539 1540 Args: 1541 row (dict): 1542 The row to convert. 1543 schema (str, dict, ~apache_beam.io.gcp.internal.clients.bigquery.\ 1544 bigquery_v2_messages.TableSchema): 1545 The table schema. Will be used to help convert the row. 1546 1547 Returns: 1548 ~apache_beam.pvalue.Row: The converted row. 1549 """ 1550 if not isinstance(schema, (bigquery.TableSchema, bigquery.TableFieldSchema)): 1551 schema = get_bq_tableschema(schema) 1552 schema_fields = {field.name: field for field in schema.fields} 1553 beam_row = {} 1554 for col_name, value in row.items(): 1555 # get this column's schema field and handle struct types 1556 field = schema_fields[col_name] 1557 if field.type.upper() in ["RECORD", "STRUCT"]: 1558 # if this is a list of records, we create a list of Beam Rows 1559 if field.mode.upper() == "REPEATED": 1560 list_of_beam_rows = [] 1561 for record in value: 1562 list_of_beam_rows.append(beam_row_from_dict(record, field)) 1563 beam_row[col_name] = list_of_beam_rows 1564 # otherwise, create a Beam Row from this record 1565 else: 1566 beam_row[col_name] = beam_row_from_dict(value, field) 1567 else: 1568 beam_row[col_name] = value 1569 return apache_beam.pvalue.Row(**beam_row) 1570 1571 1572 def get_table_schema_from_string(schema): 1573 """Transform the string table schema into a 1574 :class:`~apache_beam.io.gcp.internal.clients.bigquery.\ 1575 bigquery_v2_messages.TableSchema` instance. 1576 1577 Args: 1578 schema (str): The sting schema to be used if the BigQuery table to write 1579 has to be created. 1580 1581 Returns: 1582 ~apache_beam.io.gcp.internal.clients.bigquery.\ 1583 bigquery_v2_messages.TableSchema: 1584 The schema to be used if the BigQuery table to write has to be created 1585 but in the :class:`~apache_beam.io.gcp.internal.clients.bigquery.\ 1586 bigquery_v2_messages.TableSchema` format. 1587 """ 1588 table_schema = bigquery.TableSchema() 1589 schema_list = [s.strip() for s in schema.split(',')] 1590 for field_and_type in schema_list: 1591 field_name, field_type = field_and_type.split(':') 1592 field_schema = bigquery.TableFieldSchema() 1593 field_schema.name = field_name 1594 field_schema.type = field_type 1595 field_schema.mode = 'NULLABLE' 1596 table_schema.fields.append(field_schema) 1597 return table_schema 1598 1599 1600 def table_schema_to_dict(table_schema): 1601 """Create a dictionary representation of table schema for serialization 1602 """ 1603 def get_table_field(field): 1604 """Create a dictionary representation of a table field 1605 """ 1606 result = {} 1607 result['name'] = field.name 1608 result['type'] = field.type 1609 result['mode'] = getattr(field, 'mode', 'NULLABLE') 1610 if hasattr(field, 'description') and field.description is not None: 1611 result['description'] = field.description 1612 if hasattr(field, 'fields') and field.fields: 1613 result['fields'] = [get_table_field(f) for f in field.fields] 1614 return result 1615 1616 if not isinstance(table_schema, bigquery.TableSchema): 1617 raise ValueError("Table schema must be of the type bigquery.TableSchema") 1618 schema = {'fields': []} 1619 for field in table_schema.fields: 1620 schema['fields'].append(get_table_field(field)) 1621 return schema 1622 1623 1624 def get_dict_table_schema(schema): 1625 """Transform the table schema into a dictionary instance. 1626 1627 Args: 1628 schema (str, dict, ~apache_beam.io.gcp.internal.clients.bigquery.\ 1629 bigquery_v2_messages.TableSchema): 1630 The schema to be used if the BigQuery table to write has to be created. 1631 This can either be a dict or string or in the TableSchema format. 1632 1633 Returns: 1634 Dict[str, Any]: The schema to be used if the BigQuery table to write has 1635 to be created but in the dictionary format. 1636 """ 1637 if (isinstance(schema, (dict, value_provider.ValueProvider)) or 1638 callable(schema) or schema is None): 1639 return schema 1640 elif isinstance(schema, str): 1641 table_schema = get_table_schema_from_string(schema) 1642 return table_schema_to_dict(table_schema) 1643 elif isinstance(schema, bigquery.TableSchema): 1644 return table_schema_to_dict(schema) 1645 else: 1646 raise TypeError('Unexpected schema argument: %s.' % schema) 1647 1648 1649 def get_bq_tableschema(schema): 1650 """Convert the table schema to a TableSchema object. 1651 1652 Args: 1653 schema (str, dict, ~apache_beam.io.gcp.internal.clients.bigquery.\ 1654 bigquery_v2_messages.TableSchema): 1655 The schema to be used if the BigQuery table to write has to be created. 1656 This can either be a dict or string or in the TableSchema format. 1657 1658 Returns: 1659 ~apache_beam.io.gcp.internal.clients.bigquery.\ 1660 bigquery_v2_messages.TableSchema: The schema as a TableSchema object. 1661 """ 1662 if (isinstance(schema, 1663 (bigquery.TableSchema, value_provider.ValueProvider)) or 1664 callable(schema) or schema is None): 1665 return schema 1666 elif isinstance(schema, str): 1667 return get_table_schema_from_string(schema) 1668 elif isinstance(schema, dict): 1669 schema_string = json.dumps(schema) 1670 return parse_table_schema_from_json(schema_string) 1671 else: 1672 raise TypeError('Unexpected schema argument: %s.' % schema) 1673 1674 1675 def get_avro_schema_from_table_schema(schema): 1676 """Transform the table schema into an Avro schema. 1677 1678 Args: 1679 schema (str, dict, ~apache_beam.io.gcp.internal.clients.bigquery.\ 1680 bigquery_v2_messages.TableSchema): 1681 The TableSchema to convert to Avro schema. This can either be a dict or 1682 string or in the TableSchema format. 1683 1684 Returns: 1685 Dict[str, Any]: An Avro schema, which can be used by fastavro. 1686 """ 1687 dict_table_schema = get_dict_table_schema(schema) 1688 return bigquery_avro_tools.get_record_schema_from_dict_table_schema( 1689 "root", dict_table_schema) 1690 1691 1692 def get_beam_typehints_from_tableschema(schema): 1693 """Extracts Beam Python type hints from the schema. 1694 1695 Args: 1696 schema (~apache_beam.io.gcp.internal.clients.bigquery.\ 1697 bigquery_v2_messages.TableSchema): 1698 The TableSchema to extract type hints from. 1699 1700 Returns: 1701 List[Tuple[str, Any]]: A list of type hints that describe the input schema. 1702 Nested and repeated fields are supported. 1703 """ 1704 if not isinstance(schema, (bigquery.TableSchema, bigquery.TableFieldSchema)): 1705 schema = get_bq_tableschema(schema) 1706 typehints = [] 1707 for field in schema.fields: 1708 name, field_type, mode = field.name, field.type.upper(), field.mode.upper() 1709 1710 if field_type in ["STRUCT", "RECORD"]: 1711 # Structs can be represented as Beam Rows. 1712 typehint = RowTypeConstraint.from_fields( 1713 get_beam_typehints_from_tableschema(field)) 1714 elif field_type in BIGQUERY_TYPE_TO_PYTHON_TYPE: 1715 typehint = BIGQUERY_TYPE_TO_PYTHON_TYPE[field_type] 1716 else: 1717 raise ValueError( 1718 f"Converting BigQuery type [{field_type}] to " 1719 "Python Beam type is not supported.") 1720 1721 if mode == "REPEATED": 1722 typehint = Sequence[typehint] 1723 elif mode != "REQUIRED": 1724 typehint = Optional[typehint] 1725 1726 typehints.append((name, typehint)) 1727 return typehints 1728 1729 1730 class BigQueryJobTypes: 1731 EXPORT = 'EXPORT' 1732 COPY = 'COPY' 1733 LOAD = 'LOAD' 1734 QUERY = 'QUERY' 1735 1736 1737 def generate_bq_job_name(job_name, step_id, job_type, random=None): 1738 from apache_beam.io.gcp.bigquery import BQ_JOB_NAME_TEMPLATE 1739 random = ("_%s" % random) if random else "" 1740 return str.format( 1741 BQ_JOB_NAME_TEMPLATE, 1742 job_type=job_type, 1743 job_id=job_name.replace("-", ""), 1744 step_id=step_id, 1745 random=random) 1746 1747 1748 def check_schema_equal( 1749 left, right, *, ignore_descriptions=False, ignore_field_order=False): 1750 # type: (Union[bigquery.TableSchema, bigquery.TableFieldSchema], Union[bigquery.TableSchema, bigquery.TableFieldSchema], bool, bool) -> bool 1751 1752 """Check whether schemas are equivalent. 1753 1754 This comparison function differs from using == to compare TableSchema 1755 because it ignores categories, policy tags, descriptions (optionally), and 1756 field ordering (optionally). 1757 1758 Args: 1759 left (~apache_beam.io.gcp.internal.clients.bigquery.\ 1760 bigquery_v2_messages.TableSchema, ~apache_beam.io.gcp.internal.clients.\ 1761 bigquery.bigquery_v2_messages.TableFieldSchema): 1762 One schema to compare. 1763 right (~apache_beam.io.gcp.internal.clients.bigquery.\ 1764 bigquery_v2_messages.TableSchema, ~apache_beam.io.gcp.internal.clients.\ 1765 bigquery.bigquery_v2_messages.TableFieldSchema): 1766 The other schema to compare. 1767 ignore_descriptions (bool): (optional) Whether or not to ignore field 1768 descriptions when comparing. Defaults to False. 1769 ignore_field_order (bool): (optional) Whether or not to ignore struct field 1770 order when comparing. Defaults to False. 1771 1772 Returns: 1773 bool: True if the schemas are equivalent, False otherwise. 1774 """ 1775 if type(left) != type(right) or not isinstance( 1776 left, (bigquery.TableSchema, bigquery.TableFieldSchema)): 1777 return False 1778 1779 if isinstance(left, bigquery.TableFieldSchema): 1780 if left.name != right.name: 1781 return False 1782 1783 if left.type != right.type: 1784 # Check for type aliases 1785 if sorted( 1786 (left.type, right.type)) not in (["BOOL", "BOOLEAN"], ["FLOAT", 1787 "FLOAT64"], 1788 ["INT64", "INTEGER"], ["RECORD", 1789 "STRUCT"]): 1790 return False 1791 1792 if left.mode != right.mode: 1793 return False 1794 1795 if not ignore_descriptions and left.description != right.description: 1796 return False 1797 1798 if isinstance(left, 1799 bigquery.TableSchema) or left.type in ("RECORD", "STRUCT"): 1800 if len(left.fields) != len(right.fields): 1801 return False 1802 1803 if ignore_field_order: 1804 left_fields = sorted(left.fields, key=lambda field: field.name) 1805 right_fields = sorted(right.fields, key=lambda field: field.name) 1806 else: 1807 left_fields = left.fields 1808 right_fields = right.fields 1809 1810 for left_field, right_field in zip(left_fields, right_fields): 1811 if not check_schema_equal(left_field, 1812 right_field, 1813 ignore_descriptions=ignore_descriptions, 1814 ignore_field_order=ignore_field_order): 1815 return False 1816 1817 return True