github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_tools.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_tools.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Tools used by BigQuery sources and sinks.
    19  
    20  Classes, constants and functions in this file are experimental and have no
    21  backwards compatibility guarantees.
    22  
    23  These tools include wrappers and clients to interact with BigQuery APIs.
    24  
    25  NOTHING IN THIS FILE HAS BACKWARDS COMPATIBILITY GUARANTEES.
    26  """
    27  
    28  # pytype: skip-file
    29  
    30  import datetime
    31  import decimal
    32  import io
    33  import json
    34  import logging
    35  import sys
    36  import time
    37  import uuid
    38  from json.decoder import JSONDecodeError
    39  from typing import Optional
    40  from typing import Sequence
    41  from typing import Tuple
    42  from typing import TypeVar
    43  from typing import Union
    44  
    45  import fastavro
    46  import numpy as np
    47  import regex
    48  
    49  import apache_beam
    50  from apache_beam import coders
    51  from apache_beam.internal.gcp import auth
    52  from apache_beam.internal.gcp.json_value import from_json_value
    53  from apache_beam.internal.http_client import get_new_http
    54  from apache_beam.internal.metrics.metric import MetricLogger
    55  from apache_beam.internal.metrics.metric import Metrics
    56  from apache_beam.internal.metrics.metric import ServiceCallMetric
    57  from apache_beam.io.gcp import bigquery_avro_tools
    58  from apache_beam.io.gcp import resource_identifiers
    59  from apache_beam.io.gcp.internal.clients import bigquery
    60  from apache_beam.metrics import monitoring_infos
    61  from apache_beam.options import value_provider
    62  from apache_beam.options.pipeline_options import PipelineOptions
    63  from apache_beam.transforms import DoFn
    64  from apache_beam.typehints.row_type import RowTypeConstraint
    65  from apache_beam.typehints.typehints import Any
    66  from apache_beam.utils import retry
    67  from apache_beam.utils.histogram import LinearBucket
    68  
    69  # Protect against environments where bigquery library is not available.
    70  # pylint: disable=wrong-import-order, wrong-import-position
    71  try:
    72    from apitools.base.py.transfer import Upload
    73    from apitools.base.py.exceptions import HttpError, HttpForbiddenError
    74    from google.api_core.exceptions import ClientError, GoogleAPICallError
    75    from google.api_core.client_info import ClientInfo
    76    from google.cloud import bigquery as gcp_bigquery
    77  except ImportError:
    78    gcp_bigquery = None
    79    pass
    80  
    81  try:
    82    from orjson import dumps as fast_json_dumps
    83    from orjson import loads as fast_json_loads
    84  except ImportError:
    85    fast_json_dumps = json.dumps
    86    fast_json_loads = json.loads
    87  
    88  # pylint: enable=wrong-import-order, wrong-import-position
    89  
    90  # pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports
    91  try:
    92    from apache_beam.io.gcp.internal.clients.bigquery import TableReference
    93  except ImportError:
    94    TableReference = None
    95  # pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports
    96  
    97  _LOGGER = logging.getLogger(__name__)
    98  
    99  JSON_COMPLIANCE_ERROR = 'NAN, INF and -INF values are not JSON compliant.'
   100  MAX_RETRIES = 3
   101  UNKNOWN_MIME_TYPE = 'application/octet-stream'
   102  
   103  # Timeout for a BQ streaming insert RPC. Set to a maximum of 2 minutes.
   104  BQ_STREAMING_INSERT_TIMEOUT_SEC = 120
   105  
   106  _PROJECT_PATTERN = r'([a-z0-9.-]+:)?[a-z][a-z0-9-]*[a-z0-9]'
   107  _DATASET_PATTERN = r'\w{1,1024}'
   108  _TABLE_PATTERN = r'[\p{L}\p{M}\p{N}\p{Pc}\p{Pd}\p{Zs}$]{1,1024}'
   109  
   110  # TODO(https://github.com/apache/beam/issues/25946): Add support for
   111  # more Beam portable schema types as Python types
   112  BIGQUERY_TYPE_TO_PYTHON_TYPE = {
   113      "STRING": str,
   114      "BOOL": bool,
   115      "BOOLEAN": bool,
   116      "BYTES": bytes,
   117      "INT64": np.int64,
   118      "INTEGER": np.int64,
   119      "FLOAT64": np.float64,
   120      "FLOAT": np.float64,
   121      "NUMERIC": decimal.Decimal,
   122      "TIMESTAMP": apache_beam.utils.timestamp.Timestamp,
   123  }
   124  
   125  
   126  class FileFormat(object):
   127    CSV = 'CSV'
   128    JSON = 'NEWLINE_DELIMITED_JSON'
   129    AVRO = 'AVRO'
   130  
   131  
   132  class ExportCompression(object):
   133    GZIP = 'GZIP'
   134    DEFLATE = 'DEFLATE'
   135    SNAPPY = 'SNAPPY'
   136    NONE = 'NONE'
   137  
   138  
   139  def default_encoder(obj):
   140    if isinstance(obj, decimal.Decimal):
   141      return str(obj)
   142    elif isinstance(obj, bytes):
   143      # on python 3 base64-encoded bytes are decoded to strings
   144      # before being sent to BigQuery
   145      return obj.decode('utf-8')
   146    elif isinstance(obj, (datetime.date, datetime.time)):
   147      return str(obj)
   148    elif isinstance(obj, datetime.datetime):
   149      return obj.isoformat()
   150  
   151    _LOGGER.error("Unable to serialize %r to JSON", obj)
   152    raise TypeError(
   153        "Object of type '%s' is not JSON serializable" % type(obj).__name__)
   154  
   155  
   156  def get_hashable_destination(destination):
   157    """Parses a table reference into a (project, dataset, table) tuple.
   158  
   159    Args:
   160      destination: Either a TableReference object from the bigquery API.
   161        The object has the following attributes: projectId, datasetId, and
   162        tableId. Or a string representing the destination containing
   163        'PROJECT:DATASET.TABLE'.
   164    Returns:
   165      A string representing the destination containing
   166      'PROJECT:DATASET.TABLE'.
   167    """
   168    if isinstance(destination, TableReference):
   169      return '%s:%s.%s' % (
   170          destination.projectId, destination.datasetId, destination.tableId)
   171    else:
   172      return destination
   173  
   174  
   175  V = TypeVar('V')
   176  
   177  
   178  def to_hashable_table_ref(
   179      table_ref_elem_kv: Tuple[Union[str, TableReference], V]) -> Tuple[str, V]:
   180    """Turns the key of the input tuple to its string representation. The key
   181    should be either a string or a TableReference.
   182  
   183    Args:
   184      table_ref_elem_kv: A tuple of table reference and element.
   185  
   186    Returns:
   187      A tuple of string representation of input table and input element.
   188    """
   189    table_ref = table_ref_elem_kv[0]
   190    hashable_table_ref = get_hashable_destination(table_ref)
   191    return (hashable_table_ref, table_ref_elem_kv[1])
   192  
   193  
   194  def parse_table_schema_from_json(schema_string):
   195    """Parse the Table Schema provided as string.
   196  
   197    Args:
   198      schema_string: String serialized table schema, should be a valid JSON.
   199  
   200    Returns:
   201      A TableSchema of the BigQuery export from either the Query or the Table.
   202    """
   203    try:
   204      json_schema = json.loads(schema_string)
   205    except JSONDecodeError as e:
   206      raise ValueError(
   207          'Unable to parse JSON schema: %s - %r' % (schema_string, e))
   208  
   209    def _parse_schema_field(field):
   210      """Parse a single schema field from dictionary.
   211  
   212      Args:
   213        field: Dictionary object containing serialized schema.
   214  
   215      Returns:
   216        A TableFieldSchema for a single column in BigQuery.
   217      """
   218      schema = bigquery.TableFieldSchema()
   219      schema.name = field['name']
   220      schema.type = field['type']
   221      if 'mode' in field:
   222        schema.mode = field['mode']
   223      else:
   224        schema.mode = 'NULLABLE'
   225      if 'description' in field:
   226        schema.description = field['description']
   227      if 'fields' in field:
   228        schema.fields = [_parse_schema_field(x) for x in field['fields']]
   229      return schema
   230  
   231    fields = [_parse_schema_field(f) for f in json_schema['fields']]
   232    return bigquery.TableSchema(fields=fields)
   233  
   234  
   235  def parse_table_reference(table, dataset=None, project=None):
   236    """Parses a table reference into a (project, dataset, table) tuple.
   237  
   238    Args:
   239      table: The ID of the table. The ID must contain only letters
   240        (a-z, A-Z), numbers (0-9), connectors (-_). If dataset argument is None
   241        then the table argument must contain the entire table reference:
   242        'DATASET.TABLE' or 'PROJECT:DATASET.TABLE'. This argument can be a
   243        TableReference instance in which case dataset and project are
   244        ignored and the reference is returned as a result.  Additionally, for date
   245        partitioned tables, appending '$YYYYmmdd' to the table name is supported,
   246        e.g. 'DATASET.TABLE$YYYYmmdd'.
   247      dataset: The ID of the dataset containing this table or null if the table
   248        reference is specified entirely by the table argument.
   249      project: The ID of the project containing this table or null if the table
   250        reference is specified entirely by the table (and possibly dataset)
   251        argument.
   252  
   253    Returns:
   254      A TableReference object from the bigquery API. The object has the following
   255      attributes: projectId, datasetId, and tableId.
   256      If the input is a TableReference object, a new object will be returned.
   257  
   258    Raises:
   259      ValueError: if the table reference as a string does not match the expected
   260        format.
   261    """
   262  
   263    if isinstance(table, TableReference):
   264      return TableReference(
   265          projectId=table.projectId,
   266          datasetId=table.datasetId,
   267          tableId=table.tableId)
   268    elif callable(table):
   269      return table
   270    elif isinstance(table, value_provider.ValueProvider):
   271      return table
   272  
   273    table_reference = TableReference()
   274    # If dataset argument is not specified, the expectation is that the
   275    # table argument will contain a full table reference instead of just a
   276    # table name.
   277    if dataset is None:
   278      pattern = (
   279          f'((?P<project>{_PROJECT_PATTERN})[:\\.])?'
   280          f'(?P<dataset>{_DATASET_PATTERN})\\.(?P<table>{_TABLE_PATTERN})')
   281      match = regex.fullmatch(pattern, table)
   282      if not match:
   283        raise ValueError(
   284            'Expected a table reference (PROJECT:DATASET.TABLE or '
   285            'DATASET.TABLE) instead of %s.' % table)
   286      table_reference.projectId = match.group('project')
   287      table_reference.datasetId = match.group('dataset')
   288      table_reference.tableId = match.group('table')
   289    else:
   290      table_reference.projectId = project
   291      table_reference.datasetId = dataset
   292      table_reference.tableId = table
   293    return table_reference
   294  
   295  
   296  # -----------------------------------------------------------------------------
   297  # BigQueryWrapper.
   298  
   299  
   300  def _build_job_labels(input_labels):
   301    """Builds job label protobuf structure."""
   302    input_labels = input_labels or {}
   303    result = bigquery.JobConfiguration.LabelsValue()
   304  
   305    for k, v in input_labels.items():
   306      result.additionalProperties.append(
   307          bigquery.JobConfiguration.LabelsValue.AdditionalProperty(
   308              key=k,
   309              value=v,
   310          ))
   311    return result
   312  
   313  
   314  def _build_dataset_labels(input_labels):
   315    """Builds dataset label protobuf structure."""
   316    input_labels = input_labels or {}
   317    result = bigquery.Dataset.LabelsValue()
   318  
   319    for k, v in input_labels.items():
   320      result.additionalProperties.append(
   321          bigquery.Dataset.LabelsValue.AdditionalProperty(
   322              key=k,
   323              value=v,
   324          ))
   325    return result
   326  
   327  
   328  def _build_filter_from_labels(labels):
   329    filter_str = ''
   330    for key, value in labels.items():
   331      filter_str += 'labels.' + key + ':' + value + ' '
   332    return filter_str
   333  
   334  
   335  class BigQueryWrapper(object):
   336    """BigQuery client wrapper with utilities for querying.
   337  
   338    The wrapper is used to organize all the BigQuery integration points and
   339    offer a common place where retry logic for failures can be controlled.
   340    In addition, it offers various functions used both in sources and sinks
   341    (e.g., find and create tables, query a table, etc.).
   342    """
   343  
   344    # If updating following names, also update the corresponding pydocs in
   345    # bigquery.py.
   346    TEMP_TABLE = 'beam_temp_table_'
   347    TEMP_DATASET = 'beam_temp_dataset_'
   348  
   349    HISTOGRAM_METRIC_LOGGER = MetricLogger()
   350  
   351    def __init__(self, client=None, temp_dataset_id=None, temp_table_ref=None):
   352      self.client = client or bigquery.BigqueryV2(
   353          http=get_new_http(),
   354          credentials=auth.get_service_credentials(PipelineOptions()),
   355          response_encoding='utf8',
   356          additional_http_headers={
   357              "user-agent": "apache-beam-%s" % apache_beam.__version__
   358          })
   359      self.gcp_bq_client = client or gcp_bigquery.Client(
   360          client_info=ClientInfo(
   361              user_agent="apache-beam-%s" % apache_beam.__version__))
   362      self._unique_row_id = 0
   363      # For testing scenarios where we pass in a client we do not want a
   364      # randomized prefix for row IDs.
   365      self._row_id_prefix = '' if client else uuid.uuid4()
   366      self._latency_histogram_metric = Metrics.histogram(
   367          self.__class__,
   368          'latency_histogram_ms',
   369          LinearBucket(0, 20, 3000),
   370          BigQueryWrapper.HISTOGRAM_METRIC_LOGGER)
   371  
   372      if temp_dataset_id is not None and temp_table_ref is not None:
   373        raise ValueError(
   374            'Both a BigQuery temp_dataset_id and a temp_table_ref were specified.'
   375            ' Please specify only one of these.')
   376  
   377      if temp_dataset_id and temp_dataset_id.startswith(self.TEMP_DATASET):
   378        raise ValueError(
   379            'User provided temp dataset ID cannot start with %r' %
   380            self.TEMP_DATASET)
   381  
   382      if temp_table_ref is not None:
   383        self.temp_table_ref = temp_table_ref
   384        self.temp_dataset_id = temp_table_ref.datasetId
   385      else:
   386        self.temp_table_ref = None
   387        self._temporary_table_suffix = uuid.uuid4().hex
   388        self.temp_dataset_id = temp_dataset_id or self._get_temp_dataset()
   389  
   390      self.created_temp_dataset = False
   391  
   392    @property
   393    def unique_row_id(self):
   394      """Returns a unique row ID (str) used to avoid multiple insertions.
   395  
   396      If the row ID is provided, BigQuery will make a best effort to not insert
   397      the same row multiple times for fail and retry scenarios in which the insert
   398      request may be issued several times. This comes into play for sinks executed
   399      in a local runner.
   400  
   401      Returns:
   402        a unique row ID string
   403      """
   404      self._unique_row_id += 1
   405      return '%s_%d' % (self._row_id_prefix, self._unique_row_id)
   406  
   407    def _get_temp_table(self, project_id):
   408      if self.temp_table_ref:
   409        return self.temp_table_ref
   410  
   411      return parse_table_reference(
   412          table=BigQueryWrapper.TEMP_TABLE + self._temporary_table_suffix,
   413          dataset=self.temp_dataset_id,
   414          project=project_id)
   415  
   416    def _get_temp_dataset(self):
   417      if self.temp_table_ref:
   418        return self.temp_table_ref.datasetId
   419      return BigQueryWrapper.TEMP_DATASET + self._temporary_table_suffix
   420  
   421    @retry.with_exponential_backoff(
   422        num_retries=MAX_RETRIES,
   423        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   424    def get_query_location(self, project_id, query, use_legacy_sql):
   425      """
   426      Get the location of tables referenced in a query.
   427  
   428      This method returns the location of the first available referenced
   429      table for user in the query and depends on the BigQuery service to
   430      provide error handling for queries that reference tables in multiple
   431      locations.
   432      """
   433      reference = bigquery.JobReference(
   434          jobId=uuid.uuid4().hex, projectId=project_id)
   435      request = bigquery.BigqueryJobsInsertRequest(
   436          projectId=project_id,
   437          job=bigquery.Job(
   438              configuration=bigquery.JobConfiguration(
   439                  dryRun=True,
   440                  query=bigquery.JobConfigurationQuery(
   441                      query=query,
   442                      useLegacySql=use_legacy_sql,
   443                  )),
   444              jobReference=reference))
   445  
   446      response = self.client.jobs.Insert(request)
   447  
   448      if response.statistics is None:
   449        # This behavior is only expected in tests
   450        _LOGGER.warning(
   451            "Unable to get location, missing response.statistics. Query: %s",
   452            query)
   453        return None
   454  
   455      referenced_tables = response.statistics.query.referencedTables
   456      if referenced_tables:  # Guards against both non-empty and non-None
   457        for table in referenced_tables:
   458          try:
   459            location = self.get_table_location(
   460                table.projectId, table.datasetId, table.tableId)
   461          except HttpForbiddenError:
   462            # Permission access for table (i.e. from authorized_view),
   463            # try next one
   464            continue
   465          _LOGGER.info(
   466              "Using location %r from table %r referenced by query %s",
   467              location,
   468              table,
   469              query)
   470          return location
   471  
   472      _LOGGER.debug(
   473          "Query %s does not reference any tables or "
   474          "you don't have permission to inspect them.",
   475          query)
   476      return None
   477  
   478    @retry.with_exponential_backoff(
   479        num_retries=MAX_RETRIES,
   480        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   481    def _insert_copy_job(
   482        self,
   483        project_id,
   484        job_id,
   485        from_table_reference,
   486        to_table_reference,
   487        create_disposition=None,
   488        write_disposition=None,
   489        job_labels=None):
   490      reference = bigquery.JobReference()
   491      reference.jobId = job_id
   492      reference.projectId = project_id
   493      request = bigquery.BigqueryJobsInsertRequest(
   494          projectId=project_id,
   495          job=bigquery.Job(
   496              configuration=bigquery.JobConfiguration(
   497                  copy=bigquery.JobConfigurationTableCopy(
   498                      destinationTable=to_table_reference,
   499                      sourceTable=from_table_reference,
   500                      createDisposition=create_disposition,
   501                      writeDisposition=write_disposition,
   502                  ),
   503                  labels=_build_job_labels(job_labels),
   504              ),
   505              jobReference=reference,
   506          ))
   507  
   508      return self._start_job(request).jobReference
   509  
   510    @retry.with_exponential_backoff(
   511        num_retries=MAX_RETRIES,
   512        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   513    def _insert_load_job(
   514        self,
   515        project_id,
   516        job_id,
   517        table_reference,
   518        source_uris=None,
   519        source_stream=None,
   520        schema=None,
   521        write_disposition=None,
   522        create_disposition=None,
   523        additional_load_parameters=None,
   524        source_format=None,
   525        job_labels=None):
   526  
   527      if not source_uris and not source_stream:
   528        _LOGGER.warning(
   529            'Both source URIs and source stream are not provided. BigQuery load '
   530            'job will not load any data.')
   531  
   532      if source_uris and source_stream:
   533        raise ValueError(
   534            'Only one of source_uris and source_stream may be specified. '
   535            'Got both.')
   536  
   537      if source_uris is None:
   538        source_uris = []
   539  
   540      additional_load_parameters = additional_load_parameters or {}
   541      job_schema = None if schema == 'SCHEMA_AUTODETECT' else schema
   542      reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
   543      request = bigquery.BigqueryJobsInsertRequest(
   544          projectId=project_id,
   545          job=bigquery.Job(
   546              configuration=bigquery.JobConfiguration(
   547                  load=bigquery.JobConfigurationLoad(
   548                      sourceUris=source_uris,
   549                      destinationTable=table_reference,
   550                      schema=job_schema,
   551                      writeDisposition=write_disposition,
   552                      createDisposition=create_disposition,
   553                      sourceFormat=source_format,
   554                      useAvroLogicalTypes=True,
   555                      autodetect=schema == 'SCHEMA_AUTODETECT',
   556                      **additional_load_parameters),
   557                  labels=_build_job_labels(job_labels),
   558              ),
   559              jobReference=reference,
   560          ))
   561      return self._start_job(request, stream=source_stream).jobReference
   562  
   563    def _start_job(
   564        self,
   565        request,  # type: bigquery.BigqueryJobsInsertRequest
   566        stream=None,
   567    ):
   568      """Inserts a BigQuery job.
   569  
   570      If the job exists already, it returns it.
   571  
   572      Args:
   573        request (bigquery.BigqueryJobsInsertRequest): An insert job request.
   574        stream (IO[bytes]): A bytes IO object open for reading.
   575      """
   576      try:
   577        upload = None
   578        if stream:
   579          upload = Upload.FromStream(stream, mime_type=UNKNOWN_MIME_TYPE)
   580        response = self.client.jobs.Insert(request, upload=upload)
   581        _LOGGER.info(
   582            "Started BigQuery job: %s\n "
   583            "bq show -j --format=prettyjson --project_id=%s %s",
   584            response.jobReference,
   585            response.jobReference.projectId,
   586            response.jobReference.jobId)
   587        return response
   588      except HttpError as exn:
   589        if exn.status_code == 409:
   590          _LOGGER.info(
   591              "BigQuery job %s already exists, will not retry inserting it: %s",
   592              request.job.jobReference,
   593              exn)
   594          return request.job
   595        else:
   596          _LOGGER.info(
   597              "Failed to insert job %s: %s", request.job.jobReference, exn)
   598          raise
   599  
   600    @retry.with_exponential_backoff(
   601        num_retries=MAX_RETRIES,
   602        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   603    def _start_query_job(
   604        self,
   605        project_id,
   606        query,
   607        use_legacy_sql,
   608        flatten_results,
   609        job_id,
   610        priority,
   611        dry_run=False,
   612        kms_key=None,
   613        job_labels=None):
   614      reference = bigquery.JobReference(jobId=job_id, projectId=project_id)
   615      request = bigquery.BigqueryJobsInsertRequest(
   616          projectId=project_id,
   617          job=bigquery.Job(
   618              configuration=bigquery.JobConfiguration(
   619                  dryRun=dry_run,
   620                  query=bigquery.JobConfigurationQuery(
   621                      query=query,
   622                      useLegacySql=use_legacy_sql,
   623                      allowLargeResults=not dry_run,
   624                      destinationTable=self._get_temp_table(project_id)
   625                      if not dry_run else None,
   626                      flattenResults=flatten_results,
   627                      priority=priority,
   628                      destinationEncryptionConfiguration=bigquery.
   629                      EncryptionConfiguration(kmsKeyName=kms_key)),
   630                  labels=_build_job_labels(job_labels),
   631              ),
   632              jobReference=reference))
   633  
   634      return self._start_job(request)
   635  
   636    def wait_for_bq_job(self, job_reference, sleep_duration_sec=5, max_retries=0):
   637      """Poll job until it is DONE.
   638  
   639      Args:
   640        job_reference: bigquery.JobReference instance.
   641        sleep_duration_sec: Specifies the delay in seconds between retries.
   642        max_retries: The total number of times to retry. If equals to 0,
   643          the function waits forever.
   644  
   645      Raises:
   646        `RuntimeError`: If the job is FAILED or the number of retries has been
   647          reached.
   648      """
   649      retry = 0
   650      while True:
   651        retry += 1
   652        job = self.get_job(
   653            job_reference.projectId, job_reference.jobId, job_reference.location)
   654        logging.info('Job status: %s', job.status.state)
   655        if job.status.state == 'DONE' and job.status.errorResult:
   656          raise RuntimeError(
   657              'BigQuery job {} failed. Error Result: {}'.format(
   658                  job_reference.jobId, job.status.errorResult))
   659        elif job.status.state == 'DONE':
   660          return True
   661        else:
   662          time.sleep(sleep_duration_sec)
   663          if max_retries != 0 and retry >= max_retries:
   664            raise RuntimeError('The maximum number of retries has been reached')
   665  
   666    @retry.with_exponential_backoff(
   667        num_retries=MAX_RETRIES,
   668        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   669    def _get_query_results(
   670        self,
   671        project_id,
   672        job_id,
   673        page_token=None,
   674        max_results=10000,
   675        location=None):
   676      request = bigquery.BigqueryJobsGetQueryResultsRequest(
   677          jobId=job_id,
   678          pageToken=page_token,
   679          projectId=project_id,
   680          maxResults=max_results,
   681          location=location)
   682      response = self.client.jobs.GetQueryResults(request)
   683      return response
   684  
   685    @retry.with_exponential_backoff(
   686        num_retries=MAX_RETRIES,
   687        retry_filter=retry.retry_on_server_errors_timeout_or_quota_issues_filter)
   688    def _insert_all_rows(
   689        self,
   690        project_id,
   691        dataset_id,
   692        table_id,
   693        rows,
   694        insert_ids,
   695        skip_invalid_rows=False,
   696        ignore_unknown_values=False):
   697      """Calls the insertAll BigQuery API endpoint.
   698  
   699      Docs for this BQ call: https://cloud.google.com/bigquery/docs/reference\
   700        /rest/v2/tabledata/insertAll."""
   701      # The rows argument is a list of
   702      # bigquery.TableDataInsertAllRequest.RowsValueListEntry instances as
   703      # required by the InsertAll() method.
   704      resource = resource_identifiers.BigQueryTable(
   705          project_id, dataset_id, table_id)
   706  
   707      labels = {
   708          # TODO(ajamato): Add Ptransform label.
   709          monitoring_infos.SERVICE_LABEL: 'BigQuery',
   710          # Refer to any method which writes elements to BigQuery in batches
   711          # as "BigQueryBatchWrite". I.e. storage API's insertAll, or future
   712          # APIs introduced.
   713          monitoring_infos.METHOD_LABEL: 'BigQueryBatchWrite',
   714          monitoring_infos.RESOURCE_LABEL: resource,
   715          monitoring_infos.BIGQUERY_PROJECT_ID_LABEL: project_id,
   716          monitoring_infos.BIGQUERY_DATASET_LABEL: dataset_id,
   717          monitoring_infos.BIGQUERY_TABLE_LABEL: table_id,
   718      }
   719      service_call_metric = ServiceCallMetric(
   720          request_count_urn=monitoring_infos.API_REQUEST_COUNT_URN,
   721          base_labels=labels)
   722  
   723      started_millis = int(time.time() * 1000)
   724      try:
   725        table_ref_str = '%s.%s.%s' % (project_id, dataset_id, table_id)
   726        errors = self.gcp_bq_client.insert_rows_json(
   727            table_ref_str,
   728            json_rows=rows,
   729            row_ids=insert_ids,
   730            skip_invalid_rows=skip_invalid_rows,
   731            ignore_unknown_values=ignore_unknown_values,
   732            timeout=BQ_STREAMING_INSERT_TIMEOUT_SEC)
   733        if not errors:
   734          service_call_metric.call('ok')
   735        else:
   736          for insert_error in errors:
   737            service_call_metric.call(insert_error['errors'][0])
   738      except (ClientError, GoogleAPICallError) as e:
   739        # e.code contains the numeric http status code.
   740        service_call_metric.call(e.code)
   741        # Re-reise the exception so that we re-try appropriately.
   742        raise
   743      except HttpError as e:
   744        service_call_metric.call(e)
   745        # Re-reise the exception so that we re-try appropriately.
   746        raise
   747      finally:
   748        self._latency_histogram_metric.update(
   749            int(time.time() * 1000) - started_millis)
   750      return not errors, errors
   751  
   752    @retry.with_exponential_backoff(
   753        num_retries=MAX_RETRIES,
   754        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   755    def get_table(self, project_id, dataset_id, table_id):
   756      """Lookup a table's metadata object.
   757  
   758      Args:
   759        client: bigquery.BigqueryV2 instance
   760        project_id: table lookup parameter
   761        dataset_id: table lookup parameter
   762        table_id: table lookup parameter
   763  
   764      Returns:
   765        bigquery.Table instance
   766      Raises:
   767        HttpError: if lookup failed.
   768      """
   769      request = bigquery.BigqueryTablesGetRequest(
   770          projectId=project_id, datasetId=dataset_id, tableId=table_id)
   771      response = self.client.tables.Get(request)
   772      return response
   773  
   774    def _create_table(
   775        self,
   776        project_id,
   777        dataset_id,
   778        table_id,
   779        schema,
   780        additional_parameters=None):
   781  
   782      valid_tablename = regex.fullmatch(_TABLE_PATTERN, table_id, regex.ASCII)
   783      if not valid_tablename:
   784        raise ValueError(
   785            'Invalid BigQuery table name: %s \n'
   786            'See https://cloud.google.com/bigquery/docs/tables#table_naming' %
   787            table_id)
   788  
   789      additional_parameters = additional_parameters or {}
   790      table = bigquery.Table(
   791          tableReference=TableReference(
   792              projectId=project_id, datasetId=dataset_id, tableId=table_id),
   793          schema=schema,
   794          **additional_parameters)
   795      request = bigquery.BigqueryTablesInsertRequest(
   796          projectId=project_id, datasetId=dataset_id, table=table)
   797      response = self.client.tables.Insert(request)
   798      _LOGGER.debug("Created the table with id %s", table_id)
   799      # The response is a bigquery.Table instance.
   800      return response
   801  
   802    @retry.with_exponential_backoff(
   803        num_retries=MAX_RETRIES,
   804        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   805    def get_or_create_dataset(
   806        self, project_id, dataset_id, location=None, labels=None):
   807      # Check if dataset already exists otherwise create it
   808      try:
   809        dataset = self.client.datasets.Get(
   810            bigquery.BigqueryDatasetsGetRequest(
   811                projectId=project_id, datasetId=dataset_id))
   812        self.created_temp_dataset = False
   813        return dataset
   814      except HttpError as exn:
   815        if exn.status_code == 404:
   816          _LOGGER.info(
   817              'Dataset %s:%s does not exist so we will create it as temporary '
   818              'with location=%s',
   819              project_id,
   820              dataset_id,
   821              location)
   822          dataset_reference = bigquery.DatasetReference(
   823              projectId=project_id, datasetId=dataset_id)
   824          dataset = bigquery.Dataset(datasetReference=dataset_reference)
   825          if location is not None:
   826            dataset.location = location
   827          if labels is not None:
   828            dataset.labels = _build_dataset_labels(labels)
   829          request = bigquery.BigqueryDatasetsInsertRequest(
   830              projectId=project_id, dataset=dataset)
   831          response = self.client.datasets.Insert(request)
   832          self.created_temp_dataset = True
   833          # The response is a bigquery.Dataset instance.
   834          return response
   835        else:
   836          raise
   837  
   838    @retry.with_exponential_backoff(
   839        num_retries=MAX_RETRIES,
   840        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   841    def _is_table_empty(self, project_id, dataset_id, table_id):
   842      request = bigquery.BigqueryTabledataListRequest(
   843          projectId=project_id,
   844          datasetId=dataset_id,
   845          tableId=table_id,
   846          maxResults=1)
   847      response = self.client.tabledata.List(request)
   848      # The response is a bigquery.TableDataList instance.
   849      return response.totalRows == 0
   850  
   851    @retry.with_exponential_backoff(
   852        num_retries=MAX_RETRIES,
   853        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   854    def _delete_table(self, project_id, dataset_id, table_id):
   855      request = bigquery.BigqueryTablesDeleteRequest(
   856          projectId=project_id, datasetId=dataset_id, tableId=table_id)
   857      try:
   858        self.client.tables.Delete(request)
   859      except HttpError as exn:
   860        if exn.status_code == 404:
   861          _LOGGER.warning(
   862              'Table %s:%s.%s does not exist', project_id, dataset_id, table_id)
   863          return
   864        else:
   865          raise
   866  
   867    @retry.with_exponential_backoff(
   868        num_retries=MAX_RETRIES,
   869        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   870    def _delete_dataset(self, project_id, dataset_id, delete_contents=True):
   871      request = bigquery.BigqueryDatasetsDeleteRequest(
   872          projectId=project_id,
   873          datasetId=dataset_id,
   874          deleteContents=delete_contents)
   875      try:
   876        self.client.datasets.Delete(request)
   877      except HttpError as exn:
   878        if exn.status_code == 404:
   879          _LOGGER.warning('Dataset %s:%s does not exist', project_id, dataset_id)
   880          return
   881        else:
   882          raise
   883  
   884    @retry.with_exponential_backoff(
   885        num_retries=MAX_RETRIES,
   886        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   887    def get_table_location(self, project_id, dataset_id, table_id):
   888      table = self.get_table(project_id, dataset_id, table_id)
   889      return table.location
   890  
   891    # Returns true if the temporary dataset was provided by the user.
   892    def is_user_configured_dataset(self):
   893      return (
   894          self.temp_dataset_id and
   895          not self.temp_dataset_id.startswith(self.TEMP_DATASET))
   896  
   897    @retry.with_exponential_backoff(
   898        num_retries=MAX_RETRIES,
   899        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   900    def create_temporary_dataset(self, project_id, location, labels=None):
   901      self.get_or_create_dataset(
   902          project_id, self.temp_dataset_id, location=location, labels=labels)
   903  
   904      if (project_id is not None and not self.is_user_configured_dataset() and
   905          not self.created_temp_dataset):
   906        # Unittests don't pass projectIds so they can be run without error
   907        # User configured datasets are allowed to pre-exist.
   908        raise RuntimeError(
   909            'Dataset %s:%s already exists so cannot be used as temporary.' %
   910            (project_id, self.temp_dataset_id))
   911  
   912    @retry.with_exponential_backoff(
   913        num_retries=MAX_RETRIES,
   914        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   915    def clean_up_temporary_dataset(self, project_id):
   916      temp_table = self._get_temp_table(project_id)
   917      try:
   918        self.client.datasets.Get(
   919            bigquery.BigqueryDatasetsGetRequest(
   920                projectId=project_id, datasetId=temp_table.datasetId))
   921      except HttpError as exn:
   922        if exn.status_code == 404:
   923          _LOGGER.warning(
   924              'Dataset %s:%s does not exist', project_id, temp_table.datasetId)
   925          return
   926        else:
   927          raise
   928      try:
   929        # We do not want to delete temporary datasets configured by the user hence
   930        # we just delete the temporary table in that case.
   931        if not self.is_user_configured_dataset():
   932          self._delete_dataset(temp_table.projectId, temp_table.datasetId, True)
   933        else:
   934          self._delete_table(
   935              temp_table.projectId, temp_table.datasetId, temp_table.tableId)
   936        self.created_temp_dataset = False
   937      except HttpError as exn:
   938        if exn.status_code == 403:
   939          _LOGGER.warning(
   940              'Permission denied to delete temporary dataset %s:%s for clean up',
   941              temp_table.projectId,
   942              temp_table.datasetId)
   943          return
   944        else:
   945          raise
   946  
   947    @retry.with_exponential_backoff(
   948        num_retries=MAX_RETRIES,
   949        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   950    def _clean_up_beam_labelled_temporary_datasets(
   951        self, project_id, dataset_id=None, table_id=None, labels=None):
   952      if isinstance(labels, dict):
   953        filter_str = _build_filter_from_labels(labels)
   954  
   955      if not self.is_user_configured_dataset() and labels is not None:
   956        response = (
   957            self.client.datasets.List(
   958                bigquery.BigqueryDatasetsListRequest(
   959                    projectId=project_id, filter=filter_str)))
   960        for dataset in response.datasets:
   961          try:
   962            dataset_id = dataset.datasetReference.datasetId
   963            self._delete_dataset(project_id, dataset_id, True)
   964          except HttpError as exn:
   965            if exn.status_code == 403:
   966              _LOGGER.warning(
   967                  'Permission denied to delete temporary dataset %s:%s for '
   968                  'clean up.',
   969                  project_id,
   970                  dataset_id)
   971              return
   972            else:
   973              raise
   974      else:
   975        try:
   976          self._delete_table(project_id, dataset_id, table_id)
   977        except HttpError as exn:
   978          if exn.status_code == 403:
   979            _LOGGER.warning(
   980                'Permission denied to delete temporary table %s:%s.%s for '
   981                'clean up.',
   982                project_id,
   983                dataset_id,
   984                table_id)
   985            return
   986          else:
   987            raise
   988  
   989    @retry.with_exponential_backoff(
   990        num_retries=MAX_RETRIES,
   991        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
   992    def get_job(self, project, job_id, location=None):
   993      request = bigquery.BigqueryJobsGetRequest()
   994      request.jobId = job_id
   995      request.projectId = project
   996      request.location = location
   997  
   998      return self.client.jobs.Get(request)
   999  
  1000    def perform_load_job(
  1001        self,
  1002        destination,
  1003        job_id,
  1004        source_uris=None,
  1005        source_stream=None,
  1006        schema=None,
  1007        write_disposition=None,
  1008        create_disposition=None,
  1009        additional_load_parameters=None,
  1010        source_format=None,
  1011        job_labels=None,
  1012        load_job_project_id=None):
  1013      """Starts a job to load data into BigQuery.
  1014  
  1015      Returns:
  1016        bigquery.JobReference with the information about the job that was started.
  1017      """
  1018      project_id = (
  1019          destination.projectId
  1020          if load_job_project_id is None else load_job_project_id)
  1021  
  1022      return self._insert_load_job(
  1023          project_id,
  1024          job_id,
  1025          destination,
  1026          source_uris=source_uris,
  1027          source_stream=source_stream,
  1028          schema=schema,
  1029          create_disposition=create_disposition,
  1030          write_disposition=write_disposition,
  1031          additional_load_parameters=additional_load_parameters,
  1032          source_format=source_format,
  1033          job_labels=job_labels)
  1034  
  1035    @retry.with_exponential_backoff(
  1036        num_retries=MAX_RETRIES,
  1037        retry_filter=retry.retry_on_server_errors_and_timeout_filter)
  1038    def perform_extract_job(
  1039        self,
  1040        destination,
  1041        job_id,
  1042        table_reference,
  1043        destination_format,
  1044        project=None,
  1045        include_header=True,
  1046        compression=ExportCompression.NONE,
  1047        use_avro_logical_types=False,
  1048        job_labels=None):
  1049      """Starts a job to export data from BigQuery.
  1050  
  1051      Returns:
  1052        bigquery.JobReference with the information about the job that was started.
  1053      """
  1054      job_project = project or table_reference.projectId
  1055      job_reference = bigquery.JobReference(jobId=job_id, projectId=job_project)
  1056      request = bigquery.BigqueryJobsInsertRequest(
  1057          projectId=job_project,
  1058          job=bigquery.Job(
  1059              configuration=bigquery.JobConfiguration(
  1060                  extract=bigquery.JobConfigurationExtract(
  1061                      destinationUris=destination,
  1062                      sourceTable=table_reference,
  1063                      printHeader=include_header,
  1064                      destinationFormat=destination_format,
  1065                      compression=compression,
  1066                      useAvroLogicalTypes=use_avro_logical_types,
  1067                  ),
  1068                  labels=_build_job_labels(job_labels),
  1069              ),
  1070              jobReference=job_reference,
  1071          ))
  1072      return self._start_job(request).jobReference
  1073  
  1074    @retry.with_exponential_backoff(
  1075        num_retries=MAX_RETRIES,
  1076        retry_filter=retry.
  1077        retry_if_valid_input_but_server_error_and_timeout_filter)
  1078    def get_or_create_table(
  1079        self,
  1080        project_id,
  1081        dataset_id,
  1082        table_id,
  1083        schema,
  1084        create_disposition,
  1085        write_disposition,
  1086        additional_create_parameters=None):
  1087      """Gets or creates a table based on create and write dispositions.
  1088  
  1089      The function mimics the behavior of BigQuery import jobs when using the
  1090      same create and write dispositions.
  1091  
  1092      Args:
  1093        project_id: The project id owning the table.
  1094        dataset_id: The dataset id owning the table.
  1095        table_id: The table id.
  1096        schema: A bigquery.TableSchema instance or None.
  1097        create_disposition: CREATE_NEVER or CREATE_IF_NEEDED.
  1098        write_disposition: WRITE_APPEND, WRITE_EMPTY or WRITE_TRUNCATE.
  1099  
  1100      Returns:
  1101        A bigquery.Table instance if table was found or created.
  1102  
  1103      Raises:
  1104        `RuntimeError`: For various mismatches between the state of the table and
  1105          the create/write dispositions passed in. For example if the table is not
  1106          empty and WRITE_EMPTY was specified then an error will be raised since
  1107          the table was expected to be empty.
  1108      """
  1109      from apache_beam.io.gcp.bigquery import BigQueryDisposition
  1110  
  1111      found_table = None
  1112      try:
  1113        found_table = self.get_table(project_id, dataset_id, table_id)
  1114      except HttpError as exn:
  1115        if exn.status_code == 404:
  1116          if create_disposition == BigQueryDisposition.CREATE_NEVER:
  1117            raise RuntimeError(
  1118                'Table %s:%s.%s not found but create disposition is CREATE_NEVER.'
  1119                % (project_id, dataset_id, table_id))
  1120        else:
  1121          raise
  1122  
  1123      # If table exists already then handle the semantics for WRITE_EMPTY and
  1124      # WRITE_TRUNCATE write dispositions.
  1125      if found_table and write_disposition in (
  1126          BigQueryDisposition.WRITE_EMPTY, BigQueryDisposition.WRITE_TRUNCATE):
  1127        # Delete the table and recreate it (later) if WRITE_TRUNCATE was
  1128        # specified.
  1129        if write_disposition == BigQueryDisposition.WRITE_TRUNCATE:
  1130          self._delete_table(project_id, dataset_id, table_id)
  1131        elif (write_disposition == BigQueryDisposition.WRITE_EMPTY and
  1132              not self._is_table_empty(project_id, dataset_id, table_id)):
  1133          raise RuntimeError(
  1134              'Table %s:%s.%s is not empty but write disposition is WRITE_EMPTY.'
  1135              % (project_id, dataset_id, table_id))
  1136  
  1137      # Create a new table potentially reusing the schema from a previously
  1138      # found table in case the schema was not specified.
  1139      if schema is None and found_table is None:
  1140        raise RuntimeError(
  1141            'Table %s:%s.%s requires a schema. None can be inferred because the '
  1142            'table does not exist.' % (project_id, dataset_id, table_id))
  1143      if found_table and write_disposition != BigQueryDisposition.WRITE_TRUNCATE:
  1144        return found_table
  1145      else:
  1146        created_table = None
  1147        try:
  1148          created_table = self._create_table(
  1149              project_id=project_id,
  1150              dataset_id=dataset_id,
  1151              table_id=table_id,
  1152              schema=schema or found_table.schema,
  1153              additional_parameters=additional_create_parameters)
  1154        except HttpError as exn:
  1155          if exn.status_code == 409:
  1156            _LOGGER.debug(
  1157                'Skipping Creation. Table %s:%s.%s already exists.' %
  1158                (project_id, dataset_id, table_id))
  1159            created_table = self.get_table(project_id, dataset_id, table_id)
  1160          else:
  1161            raise
  1162        _LOGGER.info(
  1163            'Created table %s.%s.%s with schema %s. '
  1164            'Result: %s.',
  1165            project_id,
  1166            dataset_id,
  1167            table_id,
  1168            schema or found_table.schema,
  1169            created_table)
  1170        # if write_disposition == BigQueryDisposition.WRITE_TRUNCATE we delete
  1171        # the table before this point.
  1172        if write_disposition == BigQueryDisposition.WRITE_TRUNCATE:
  1173          # BigQuery can route data to the old table for 2 mins max so wait
  1174          # that much time before creating the table and writing it
  1175          _LOGGER.warning(
  1176              'Sleeping for 150 seconds before the write as ' +
  1177              'BigQuery inserts can be routed to deleted table ' +
  1178              'for 2 mins after the delete and create.')
  1179          # TODO(BEAM-2673): Remove this sleep by migrating to load api
  1180          time.sleep(150)
  1181          return created_table
  1182        else:
  1183          return created_table
  1184  
  1185    def run_query(
  1186        self,
  1187        project_id,
  1188        query,
  1189        use_legacy_sql,
  1190        flatten_results,
  1191        priority,
  1192        dry_run=False,
  1193        job_labels=None):
  1194      job = self._start_query_job(
  1195          project_id,
  1196          query,
  1197          use_legacy_sql,
  1198          flatten_results,
  1199          job_id=uuid.uuid4().hex,
  1200          priority=priority,
  1201          dry_run=dry_run,
  1202          job_labels=job_labels)
  1203      job_id = job.jobReference.jobId
  1204      location = job.jobReference.location
  1205  
  1206      if dry_run:
  1207        # If this was a dry run then the fact that we get here means the
  1208        # query has no errors. The start_query_job would raise an error otherwise.
  1209        return
  1210      page_token = None
  1211      while True:
  1212        response = self._get_query_results(
  1213            project_id, job_id, page_token, location=location)
  1214        if not response.jobComplete:
  1215          # The jobComplete field can be False if the query request times out
  1216          # (default is 10 seconds). Note that this is a timeout for the query
  1217          # request not for the actual execution of the query in the service.  If
  1218          # the request times out we keep trying. This situation is quite possible
  1219          # if the query will return a large number of rows.
  1220          _LOGGER.info('Waiting on response from query: %s ...', query)
  1221          time.sleep(1.0)
  1222          continue
  1223        # We got some results. The last page is signalled by a missing pageToken.
  1224        yield response.rows, response.schema
  1225        if not response.pageToken:
  1226          break
  1227        page_token = response.pageToken
  1228  
  1229    def insert_rows(
  1230        self,
  1231        project_id,
  1232        dataset_id,
  1233        table_id,
  1234        rows,
  1235        insert_ids=None,
  1236        skip_invalid_rows=False,
  1237        ignore_unknown_values=False):
  1238      """Inserts rows into the specified table.
  1239  
  1240      Args:
  1241        project_id: The project id owning the table.
  1242        dataset_id: The dataset id owning the table.
  1243        table_id: The table id.
  1244        rows: A list of plain Python dictionaries. Each dictionary is a row and
  1245          each key in it is the name of a field.
  1246        skip_invalid_rows: If there are rows with insertion errors, whether they
  1247          should be skipped, and all others should be inserted successfully.
  1248        ignore_unknown_values: Set this option to true to ignore unknown column
  1249          names. If the input rows contain columns that are not
  1250          part of the existing table's schema, those columns are ignored, and
  1251          the rows are successfully inserted.
  1252  
  1253      Returns:
  1254        A tuple (bool, errors). If first element is False then the second element
  1255        will be a bigquery.InsertErrorsValueListEntry instance containing
  1256        specific errors.
  1257      """
  1258  
  1259      # Prepare rows for insertion. Of special note is the row ID that we add to
  1260      # each row in order to help BigQuery avoid inserting a row multiple times.
  1261      # BigQuery will do a best-effort if unique IDs are provided. This situation
  1262      # can happen during retries on failures.
  1263      # TODO(silviuc): Must add support to writing TableRow's instead of dicts.
  1264      insert_ids = [
  1265          str(self.unique_row_id) if not insert_ids else insert_ids[i] for i,
  1266          _ in enumerate(rows)
  1267      ]
  1268      rows = [
  1269          fast_json_loads(fast_json_dumps(r, default=default_encoder))
  1270          for r in rows
  1271      ]
  1272  
  1273      result, errors = self._insert_all_rows(
  1274          project_id, dataset_id, table_id, rows, insert_ids,
  1275          skip_invalid_rows=skip_invalid_rows,
  1276          ignore_unknown_values=ignore_unknown_values)
  1277      return result, errors
  1278  
  1279    def _convert_cell_value_to_dict(self, value, field):
  1280      if field.type == 'STRING':
  1281        # Input: "XYZ" --> Output: "XYZ"
  1282        return value
  1283      elif field.type == 'BOOLEAN':
  1284        # Input: "true" --> Output: True
  1285        return value == 'true'
  1286      elif field.type == 'INTEGER':
  1287        # Input: "123" --> Output: 123
  1288        return int(value)
  1289      elif field.type == 'FLOAT':
  1290        # Input: "1.23" --> Output: 1.23
  1291        return float(value)
  1292      elif field.type == 'TIMESTAMP':
  1293        # The UTC should come from the timezone library but this is a known
  1294        # issue in python 2.7 so we'll just hardcode it as we're reading using
  1295        # utcfromtimestamp.
  1296        # Input: 1478134176.985864 --> Output: "2016-11-03 00:49:36.985864 UTC"
  1297        dt = datetime.datetime.utcfromtimestamp(float(value))
  1298        return dt.strftime('%Y-%m-%d %H:%M:%S.%f UTC')
  1299      elif field.type == 'BYTES':
  1300        # Input: "YmJi" --> Output: "YmJi"
  1301        return value
  1302      elif field.type == 'DATE':
  1303        # Input: "2016-11-03" --> Output: "2016-11-03"
  1304        return value
  1305      elif field.type == 'DATETIME':
  1306        # Input: "2016-11-03T00:49:36" --> Output: "2016-11-03T00:49:36"
  1307        return value
  1308      elif field.type == 'TIME':
  1309        # Input: "00:49:36" --> Output: "00:49:36"
  1310        return value
  1311      elif field.type == 'RECORD':
  1312        # Note that a schema field object supports also a RECORD type. However
  1313        # when querying, the repeated and/or record fields are flattened
  1314        # unless we pass the flatten_results flag as False to the source
  1315        return self.convert_row_to_dict(value, field)
  1316      elif field.type == 'NUMERIC':
  1317        return decimal.Decimal(value)
  1318      elif field.type == 'GEOGRAPHY':
  1319        return value
  1320      else:
  1321        raise RuntimeError('Unexpected field type: %s' % field.type)
  1322  
  1323    def convert_row_to_dict(self, row, schema):
  1324      """Converts a TableRow instance using the schema to a Python dict."""
  1325      result = {}
  1326      for index, field in enumerate(schema.fields):
  1327        value = None
  1328        if isinstance(schema, bigquery.TableSchema):
  1329          cell = row.f[index]
  1330          value = from_json_value(cell.v) if cell.v is not None else None
  1331        elif isinstance(schema, bigquery.TableFieldSchema):
  1332          cell = row['f'][index]
  1333          value = cell['v'] if 'v' in cell else None
  1334        if field.mode == 'REPEATED':
  1335          if value is None:
  1336            # Ideally this should never happen as repeated fields default to
  1337            # returning an empty list
  1338            result[field.name] = []
  1339          else:
  1340            result[field.name] = [
  1341                self._convert_cell_value_to_dict(x['v'], field) for x in value
  1342            ]
  1343        elif value is None:
  1344          if not field.mode == 'NULLABLE':
  1345            raise ValueError(
  1346                'Received \'None\' as the value for the field %s '
  1347                'but the field is not NULLABLE.' % field.name)
  1348          result[field.name] = None
  1349        else:
  1350          result[field.name] = self._convert_cell_value_to_dict(value, field)
  1351      return result
  1352  
  1353  
  1354  class RowAsDictJsonCoder(coders.Coder):
  1355    """A coder for a table row (represented as a dict) to/from a JSON string.
  1356  
  1357    This is the default coder for sources and sinks if the coder argument is not
  1358    specified.
  1359    """
  1360    def encode(self, table_row):
  1361      # The normal error when dumping NAN/INF values is:
  1362      # ValueError: Out of range float values are not JSON compliant
  1363      # This code will catch this error to emit an error that explains
  1364      # to the programmer that they have used NAN/INF values.
  1365      try:
  1366        return json.dumps(
  1367            table_row,
  1368            allow_nan=False,
  1369            ensure_ascii=False,
  1370            default=default_encoder).encode('utf-8')
  1371      except ValueError as e:
  1372        raise ValueError(
  1373            '%s. %s. Row: %r' % (e, JSON_COMPLIANCE_ERROR, table_row))
  1374  
  1375    def decode(self, encoded_table_row):
  1376      return json.loads(encoded_table_row.decode('utf-8'))
  1377  
  1378    def to_type_hint(self):
  1379      return Any
  1380  
  1381  
  1382  class JsonRowWriter(io.IOBase):
  1383    """
  1384    A writer which provides an IOBase-like interface for writing table rows
  1385    (represented as dicts) as newline-delimited JSON strings.
  1386    """
  1387    def __init__(self, file_handle):
  1388      """Initialize an JsonRowWriter.
  1389  
  1390      Args:
  1391        file_handle (io.IOBase): Output stream to write to.
  1392      """
  1393      if not file_handle.writable():
  1394        raise ValueError("Output stream must be writable")
  1395  
  1396      self._file_handle = file_handle
  1397      self._coder = RowAsDictJsonCoder()
  1398  
  1399    def close(self):
  1400      self._file_handle.close()
  1401  
  1402    @property
  1403    def closed(self):
  1404      return self._file_handle.closed
  1405  
  1406    def flush(self):
  1407      self._file_handle.flush()
  1408  
  1409    def read(self, size=-1):
  1410      raise io.UnsupportedOperation("JsonRowWriter is not readable")
  1411  
  1412    def tell(self):
  1413      return self._file_handle.tell()
  1414  
  1415    def writable(self):
  1416      return self._file_handle.writable()
  1417  
  1418    def write(self, row):
  1419      return self._file_handle.write(self._coder.encode(row) + b'\n')
  1420  
  1421  
  1422  class AvroRowWriter(io.IOBase):
  1423    """
  1424    A writer which provides an IOBase-like interface for writing table rows
  1425    (represented as dicts) as Avro records.
  1426    """
  1427    def __init__(self, file_handle, schema):
  1428      """Initialize an AvroRowWriter.
  1429  
  1430      Args:
  1431        file_handle (io.IOBase): Output stream to write Avro records to.
  1432        schema (Dict[Text, Any]): BigQuery table schema.
  1433      """
  1434      if not file_handle.writable():
  1435        raise ValueError("Output stream must be writable")
  1436  
  1437      self._file_handle = file_handle
  1438      avro_schema = fastavro.parse_schema(
  1439          get_avro_schema_from_table_schema(schema))
  1440      self._avro_writer = fastavro.write.Writer(self._file_handle, avro_schema)
  1441  
  1442    def close(self):
  1443      if not self._file_handle.closed:
  1444        self.flush()
  1445        self._file_handle.close()
  1446  
  1447    @property
  1448    def closed(self):
  1449      return self._file_handle.closed
  1450  
  1451    def flush(self):
  1452      if self._file_handle.closed:
  1453        raise ValueError("flush on closed file")
  1454  
  1455      self._avro_writer.flush()
  1456      self._file_handle.flush()
  1457  
  1458    def read(self, size=-1):
  1459      raise io.UnsupportedOperation("AvroRowWriter is not readable")
  1460  
  1461    def tell(self):
  1462      # Flush the fastavro Writer to the underlying stream, otherwise there isn't
  1463      # a reliable way to determine how many bytes have been written.
  1464      self._avro_writer.flush()
  1465      return self._file_handle.tell()
  1466  
  1467    def writable(self):
  1468      return self._file_handle.writable()
  1469  
  1470    def write(self, row):
  1471      try:
  1472        self._avro_writer.write(row)
  1473      except (TypeError, ValueError) as ex:
  1474        _, _, tb = sys.exc_info()
  1475        raise ex.__class__(
  1476            "Error writing row to Avro: {}\nSchema: {}\nRow: {}".format(
  1477                ex, self._avro_writer.schema, row)).with_traceback(tb)
  1478  
  1479  
  1480  class RetryStrategy(object):
  1481    RETRY_ALWAYS = 'RETRY_ALWAYS'
  1482    RETRY_NEVER = 'RETRY_NEVER'
  1483    RETRY_ON_TRANSIENT_ERROR = 'RETRY_ON_TRANSIENT_ERROR'
  1484  
  1485    _NON_TRANSIENT_ERRORS = {'invalid', 'invalidQuery', 'notImplemented'}
  1486  
  1487    @staticmethod
  1488    def should_retry(strategy, error_message):
  1489      if strategy == RetryStrategy.RETRY_ALWAYS:
  1490        return True
  1491      elif strategy == RetryStrategy.RETRY_NEVER:
  1492        return False
  1493      elif (strategy == RetryStrategy.RETRY_ON_TRANSIENT_ERROR and
  1494            error_message not in RetryStrategy._NON_TRANSIENT_ERRORS):
  1495        return True
  1496      else:
  1497        return False
  1498  
  1499  
  1500  class AppendDestinationsFn(DoFn):
  1501    """Adds the destination to an element, making it a KV pair.
  1502  
  1503    Outputs a PCollection of KV-pairs where the key is a TableReference for the
  1504    destination, and the value is the record itself.
  1505  
  1506    Experimental; no backwards compatibility guarantees.
  1507    """
  1508    def __init__(self, destination):
  1509      self._display_destination = destination
  1510      self.destination = AppendDestinationsFn._get_table_fn(destination)
  1511  
  1512    def display_data(self):
  1513      return {'destination': str(self._display_destination)}
  1514  
  1515    @staticmethod
  1516    def _value_provider_or_static_val(elm):
  1517      if isinstance(elm, value_provider.ValueProvider):
  1518        return elm
  1519      else:
  1520        # The type argument is a NoOp, because we assume the argument already has
  1521        # the proper formatting.
  1522        return value_provider.StaticValueProvider(lambda x: x, value=elm)
  1523  
  1524    @staticmethod
  1525    def _get_table_fn(destination):
  1526      if callable(destination):
  1527        return destination
  1528      else:
  1529        return lambda x: AppendDestinationsFn._value_provider_or_static_val(
  1530            destination).get()
  1531  
  1532    def process(self, element, *side_inputs):
  1533      yield (self.destination(element, *side_inputs), element)
  1534  
  1535  
  1536  def beam_row_from_dict(row: dict, schema):
  1537    """Converts a dictionary row to a Beam Row.
  1538    Nested records and lists are supported.
  1539  
  1540    Args:
  1541      row (dict):
  1542        The row to convert.
  1543      schema (str, dict, ~apache_beam.io.gcp.internal.clients.bigquery.\
  1544  bigquery_v2_messages.TableSchema):
  1545        The table schema. Will be used to help convert the row.
  1546  
  1547    Returns:
  1548      ~apache_beam.pvalue.Row: The converted row.
  1549    """
  1550    if not isinstance(schema, (bigquery.TableSchema, bigquery.TableFieldSchema)):
  1551      schema = get_bq_tableschema(schema)
  1552    schema_fields = {field.name: field for field in schema.fields}
  1553    beam_row = {}
  1554    for col_name, value in row.items():
  1555      # get this column's schema field and handle struct types
  1556      field = schema_fields[col_name]
  1557      if field.type.upper() in ["RECORD", "STRUCT"]:
  1558        # if this is a list of records, we create a list of Beam Rows
  1559        if field.mode.upper() == "REPEATED":
  1560          list_of_beam_rows = []
  1561          for record in value:
  1562            list_of_beam_rows.append(beam_row_from_dict(record, field))
  1563          beam_row[col_name] = list_of_beam_rows
  1564        # otherwise, create a Beam Row from this record
  1565        else:
  1566          beam_row[col_name] = beam_row_from_dict(value, field)
  1567      else:
  1568        beam_row[col_name] = value
  1569    return apache_beam.pvalue.Row(**beam_row)
  1570  
  1571  
  1572  def get_table_schema_from_string(schema):
  1573    """Transform the string table schema into a
  1574    :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
  1575  bigquery_v2_messages.TableSchema` instance.
  1576  
  1577    Args:
  1578      schema (str): The sting schema to be used if the BigQuery table to write
  1579        has to be created.
  1580  
  1581    Returns:
  1582      ~apache_beam.io.gcp.internal.clients.bigquery.\
  1583  bigquery_v2_messages.TableSchema:
  1584      The schema to be used if the BigQuery table to write has to be created
  1585      but in the :class:`~apache_beam.io.gcp.internal.clients.bigquery.\
  1586  bigquery_v2_messages.TableSchema` format.
  1587    """
  1588    table_schema = bigquery.TableSchema()
  1589    schema_list = [s.strip() for s in schema.split(',')]
  1590    for field_and_type in schema_list:
  1591      field_name, field_type = field_and_type.split(':')
  1592      field_schema = bigquery.TableFieldSchema()
  1593      field_schema.name = field_name
  1594      field_schema.type = field_type
  1595      field_schema.mode = 'NULLABLE'
  1596      table_schema.fields.append(field_schema)
  1597    return table_schema
  1598  
  1599  
  1600  def table_schema_to_dict(table_schema):
  1601    """Create a dictionary representation of table schema for serialization
  1602    """
  1603    def get_table_field(field):
  1604      """Create a dictionary representation of a table field
  1605      """
  1606      result = {}
  1607      result['name'] = field.name
  1608      result['type'] = field.type
  1609      result['mode'] = getattr(field, 'mode', 'NULLABLE')
  1610      if hasattr(field, 'description') and field.description is not None:
  1611        result['description'] = field.description
  1612      if hasattr(field, 'fields') and field.fields:
  1613        result['fields'] = [get_table_field(f) for f in field.fields]
  1614      return result
  1615  
  1616    if not isinstance(table_schema, bigquery.TableSchema):
  1617      raise ValueError("Table schema must be of the type bigquery.TableSchema")
  1618    schema = {'fields': []}
  1619    for field in table_schema.fields:
  1620      schema['fields'].append(get_table_field(field))
  1621    return schema
  1622  
  1623  
  1624  def get_dict_table_schema(schema):
  1625    """Transform the table schema into a dictionary instance.
  1626  
  1627    Args:
  1628      schema (str, dict, ~apache_beam.io.gcp.internal.clients.bigquery.\
  1629  bigquery_v2_messages.TableSchema):
  1630        The schema to be used if the BigQuery table to write has to be created.
  1631        This can either be a dict or string or in the TableSchema format.
  1632  
  1633    Returns:
  1634      Dict[str, Any]: The schema to be used if the BigQuery table to write has
  1635      to be created but in the dictionary format.
  1636    """
  1637    if (isinstance(schema, (dict, value_provider.ValueProvider)) or
  1638        callable(schema) or schema is None):
  1639      return schema
  1640    elif isinstance(schema, str):
  1641      table_schema = get_table_schema_from_string(schema)
  1642      return table_schema_to_dict(table_schema)
  1643    elif isinstance(schema, bigquery.TableSchema):
  1644      return table_schema_to_dict(schema)
  1645    else:
  1646      raise TypeError('Unexpected schema argument: %s.' % schema)
  1647  
  1648  
  1649  def get_bq_tableschema(schema):
  1650    """Convert the table schema to a TableSchema object.
  1651  
  1652    Args:
  1653      schema (str, dict, ~apache_beam.io.gcp.internal.clients.bigquery.\
  1654  bigquery_v2_messages.TableSchema):
  1655        The schema to be used if the BigQuery table to write has to be created.
  1656        This can either be a dict or string or in the TableSchema format.
  1657  
  1658    Returns:
  1659      ~apache_beam.io.gcp.internal.clients.bigquery.\
  1660  bigquery_v2_messages.TableSchema: The schema as a TableSchema object.
  1661    """
  1662    if (isinstance(schema,
  1663                   (bigquery.TableSchema, value_provider.ValueProvider)) or
  1664        callable(schema) or schema is None):
  1665      return schema
  1666    elif isinstance(schema, str):
  1667      return get_table_schema_from_string(schema)
  1668    elif isinstance(schema, dict):
  1669      schema_string = json.dumps(schema)
  1670      return parse_table_schema_from_json(schema_string)
  1671    else:
  1672      raise TypeError('Unexpected schema argument: %s.' % schema)
  1673  
  1674  
  1675  def get_avro_schema_from_table_schema(schema):
  1676    """Transform the table schema into an Avro schema.
  1677  
  1678    Args:
  1679      schema (str, dict, ~apache_beam.io.gcp.internal.clients.bigquery.\
  1680  bigquery_v2_messages.TableSchema):
  1681        The TableSchema to convert to Avro schema. This can either be a dict or
  1682        string or in the TableSchema format.
  1683  
  1684    Returns:
  1685      Dict[str, Any]: An Avro schema, which can be used by fastavro.
  1686    """
  1687    dict_table_schema = get_dict_table_schema(schema)
  1688    return bigquery_avro_tools.get_record_schema_from_dict_table_schema(
  1689        "root", dict_table_schema)
  1690  
  1691  
  1692  def get_beam_typehints_from_tableschema(schema):
  1693    """Extracts Beam Python type hints from the schema.
  1694  
  1695    Args:
  1696      schema (~apache_beam.io.gcp.internal.clients.bigquery.\
  1697  bigquery_v2_messages.TableSchema):
  1698        The TableSchema to extract type hints from.
  1699  
  1700    Returns:
  1701      List[Tuple[str, Any]]: A list of type hints that describe the input schema.
  1702      Nested and repeated fields are supported.
  1703    """
  1704    if not isinstance(schema, (bigquery.TableSchema, bigquery.TableFieldSchema)):
  1705      schema = get_bq_tableschema(schema)
  1706    typehints = []
  1707    for field in schema.fields:
  1708      name, field_type, mode = field.name, field.type.upper(), field.mode.upper()
  1709  
  1710      if field_type in ["STRUCT", "RECORD"]:
  1711        # Structs can be represented as Beam Rows.
  1712        typehint = RowTypeConstraint.from_fields(
  1713            get_beam_typehints_from_tableschema(field))
  1714      elif field_type in BIGQUERY_TYPE_TO_PYTHON_TYPE:
  1715        typehint = BIGQUERY_TYPE_TO_PYTHON_TYPE[field_type]
  1716      else:
  1717        raise ValueError(
  1718            f"Converting BigQuery type [{field_type}] to "
  1719            "Python Beam type is not supported.")
  1720  
  1721      if mode == "REPEATED":
  1722        typehint = Sequence[typehint]
  1723      elif mode != "REQUIRED":
  1724        typehint = Optional[typehint]
  1725  
  1726      typehints.append((name, typehint))
  1727    return typehints
  1728  
  1729  
  1730  class BigQueryJobTypes:
  1731    EXPORT = 'EXPORT'
  1732    COPY = 'COPY'
  1733    LOAD = 'LOAD'
  1734    QUERY = 'QUERY'
  1735  
  1736  
  1737  def generate_bq_job_name(job_name, step_id, job_type, random=None):
  1738    from apache_beam.io.gcp.bigquery import BQ_JOB_NAME_TEMPLATE
  1739    random = ("_%s" % random) if random else ""
  1740    return str.format(
  1741        BQ_JOB_NAME_TEMPLATE,
  1742        job_type=job_type,
  1743        job_id=job_name.replace("-", ""),
  1744        step_id=step_id,
  1745        random=random)
  1746  
  1747  
  1748  def check_schema_equal(
  1749      left, right, *, ignore_descriptions=False, ignore_field_order=False):
  1750    # type: (Union[bigquery.TableSchema, bigquery.TableFieldSchema], Union[bigquery.TableSchema, bigquery.TableFieldSchema], bool, bool) -> bool
  1751  
  1752    """Check whether schemas are equivalent.
  1753  
  1754    This comparison function differs from using == to compare TableSchema
  1755    because it ignores categories, policy tags, descriptions (optionally), and
  1756    field ordering (optionally).
  1757  
  1758    Args:
  1759      left (~apache_beam.io.gcp.internal.clients.bigquery.\
  1760  bigquery_v2_messages.TableSchema, ~apache_beam.io.gcp.internal.clients.\
  1761  bigquery.bigquery_v2_messages.TableFieldSchema):
  1762        One schema to compare.
  1763      right (~apache_beam.io.gcp.internal.clients.bigquery.\
  1764  bigquery_v2_messages.TableSchema, ~apache_beam.io.gcp.internal.clients.\
  1765  bigquery.bigquery_v2_messages.TableFieldSchema):
  1766        The other schema to compare.
  1767      ignore_descriptions (bool): (optional) Whether or not to ignore field
  1768        descriptions when comparing. Defaults to False.
  1769      ignore_field_order (bool): (optional) Whether or not to ignore struct field
  1770        order when comparing. Defaults to False.
  1771  
  1772    Returns:
  1773      bool: True if the schemas are equivalent, False otherwise.
  1774    """
  1775    if type(left) != type(right) or not isinstance(
  1776        left, (bigquery.TableSchema, bigquery.TableFieldSchema)):
  1777      return False
  1778  
  1779    if isinstance(left, bigquery.TableFieldSchema):
  1780      if left.name != right.name:
  1781        return False
  1782  
  1783      if left.type != right.type:
  1784        # Check for type aliases
  1785        if sorted(
  1786            (left.type, right.type)) not in (["BOOL", "BOOLEAN"], ["FLOAT",
  1787                                                                   "FLOAT64"],
  1788                                             ["INT64", "INTEGER"], ["RECORD",
  1789                                                                    "STRUCT"]):
  1790          return False
  1791  
  1792      if left.mode != right.mode:
  1793        return False
  1794  
  1795      if not ignore_descriptions and left.description != right.description:
  1796        return False
  1797  
  1798    if isinstance(left,
  1799                  bigquery.TableSchema) or left.type in ("RECORD", "STRUCT"):
  1800      if len(left.fields) != len(right.fields):
  1801        return False
  1802  
  1803      if ignore_field_order:
  1804        left_fields = sorted(left.fields, key=lambda field: field.name)
  1805        right_fields = sorted(right.fields, key=lambda field: field.name)
  1806      else:
  1807        left_fields = left.fields
  1808        right_fields = right.fields
  1809  
  1810      for left_field, right_field in zip(left_fields, right_fields):
  1811        if not check_schema_equal(left_field,
  1812                                  right_field,
  1813                                  ignore_descriptions=ignore_descriptions,
  1814                                  ignore_field_order=ignore_field_order):
  1815          return False
  1816  
  1817    return True