github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_read_internal.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_read_internal.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  Internal library for reading data from BigQuery.
    20  
    21  NOTHING IN THIS FILE HAS BACKWARDS COMPATIBILITY GUARANTEES.
    22  """
    23  import collections
    24  import decimal
    25  import json
    26  import logging
    27  import random
    28  import time
    29  import uuid
    30  from typing import TYPE_CHECKING
    31  from typing import Any
    32  from typing import Dict
    33  from typing import Iterable
    34  from typing import List
    35  from typing import Optional
    36  from typing import Union
    37  
    38  import apache_beam as beam
    39  from apache_beam.coders import coders
    40  from apache_beam.io.avroio import _create_avro_source
    41  from apache_beam.io.filesystem import CompressionTypes
    42  from apache_beam.io.filesystems import FileSystems
    43  from apache_beam.io.gcp import bigquery_tools
    44  from apache_beam.io.gcp.bigquery_io_metadata import create_bigquery_io_metadata
    45  from apache_beam.io.iobase import BoundedSource
    46  from apache_beam.io.textio import _TextSource
    47  from apache_beam.options.pipeline_options import GoogleCloudOptions
    48  from apache_beam.options.pipeline_options import PipelineOptions
    49  from apache_beam.options.value_provider import ValueProvider
    50  from apache_beam.transforms import PTransform
    51  
    52  if TYPE_CHECKING:
    53    from apache_beam.io.gcp.bigquery import ReadFromBigQueryRequest
    54  
    55  try:
    56    from apache_beam.io.gcp.internal.clients.bigquery import DatasetReference
    57    from apache_beam.io.gcp.internal.clients.bigquery import TableReference
    58  except ImportError:
    59    DatasetReference = None
    60    TableReference = None
    61  
    62  _LOGGER = logging.getLogger(__name__)
    63  
    64  
    65  def bigquery_export_destination_uri(
    66      gcs_location_vp: Optional[ValueProvider],
    67      temp_location: Optional[str],
    68      unique_id: str,
    69      directory_only: bool = False,
    70  ) -> str:
    71    """Returns the fully qualified Google Cloud Storage URI where the
    72    extracted table should be written.
    73    """
    74    file_pattern = 'bigquery-table-dump-*.json'
    75  
    76    gcs_location = None
    77    if gcs_location_vp is not None:
    78      gcs_location = gcs_location_vp.get()
    79  
    80    if gcs_location is not None:
    81      gcs_base = gcs_location
    82    elif temp_location is not None:
    83      gcs_base = temp_location
    84      _LOGGER.debug("gcs_location is empty, using temp_location instead")
    85    else:
    86      raise ValueError(
    87          'ReadFromBigQuery requires a GCS location to be provided. Neither '
    88          'gcs_location in the constructor nor the fallback option '
    89          '--temp_location is set.')
    90  
    91    if not unique_id:
    92      unique_id = uuid.uuid4().hex
    93  
    94    if directory_only:
    95      return FileSystems.join(gcs_base, unique_id)
    96    else:
    97      return FileSystems.join(gcs_base, unique_id, file_pattern)
    98  
    99  
   100  class _PassThroughThenCleanup(PTransform):
   101    """A PTransform that invokes a DoFn after the input PCollection has been
   102      processed.
   103  
   104      DoFn should have arguments (element, side_input, cleanup_signal).
   105  
   106      Utilizes readiness of PCollection to trigger DoFn.
   107    """
   108    def __init__(self, side_input=None):
   109      self.side_input = side_input
   110  
   111    def expand(self, input):
   112      class PassThrough(beam.DoFn):
   113        def process(self, element):
   114          yield element
   115  
   116      class RemoveExtractedFiles(beam.DoFn):
   117        def process(self, unused_element, unused_signal, gcs_locations):
   118          FileSystems.delete(list(gcs_locations))
   119  
   120      main_output, cleanup_signal = input | beam.ParDo(
   121          PassThrough()).with_outputs(
   122          'cleanup_signal', main='main')
   123  
   124      cleanup_input = input.pipeline | beam.Create([None])
   125  
   126      _ = cleanup_input | beam.ParDo(
   127          RemoveExtractedFiles(),
   128          beam.pvalue.AsSingleton(cleanup_signal),
   129          self.side_input,
   130      )
   131  
   132      return main_output
   133  
   134  
   135  class _PassThroughThenCleanupTempDatasets(PTransform):
   136    """A PTransform that invokes a DoFn after the input PCollection has been
   137      processed.
   138  
   139      DoFn should have arguments (element, side_input, cleanup_signal).
   140  
   141      Utilizes readiness of PCollection to trigger DoFn.
   142    """
   143    def __init__(self, side_input=None):
   144      self.side_input = side_input
   145  
   146    def expand(self, input):
   147      class PassThrough(beam.DoFn):
   148        def process(self, element):
   149          yield element
   150  
   151      class CleanUpProjects(beam.DoFn):
   152        def process(self, unused_element, unused_signal, pipeline_details):
   153          bq = bigquery_tools.BigQueryWrapper()
   154          pipeline_details = pipeline_details[0]
   155          if 'temp_table_ref' in pipeline_details.keys():
   156            temp_table_ref = pipeline_details['temp_table_ref']
   157            bq._clean_up_beam_labelled_temporary_datasets(
   158                project_id=temp_table_ref.projectId,
   159                dataset_id=temp_table_ref.datasetId,
   160                table_id=temp_table_ref.tableId)
   161          elif 'project_id' in pipeline_details.keys():
   162            bq._clean_up_beam_labelled_temporary_datasets(
   163                project_id=pipeline_details['project_id'],
   164                labels=pipeline_details['bigquery_dataset_labels'])
   165  
   166      main_output, cleanup_signal = input | beam.ParDo(
   167          PassThrough()).with_outputs(
   168          'cleanup_signal', main='main')
   169  
   170      cleanup_input = input.pipeline | beam.Create([None])
   171  
   172      _ = cleanup_input | beam.ParDo(
   173          CleanUpProjects(),
   174          beam.pvalue.AsSingleton(cleanup_signal),
   175          self.side_input,
   176      )
   177  
   178      return main_output
   179  
   180  
   181  class _BigQueryReadSplit(beam.transforms.DoFn):
   182    """Starts the process of reading from BigQuery.
   183  
   184    This transform will start a BigQuery export job, and output a number of
   185    file sources that are consumed downstream.
   186    """
   187    def __init__(
   188        self,
   189        options: PipelineOptions,
   190        gcs_location: Union[str, ValueProvider] = None,
   191        use_json_exports: bool = False,
   192        bigquery_job_labels: Dict[str, str] = None,
   193        step_name: str = None,
   194        job_name: str = None,
   195        unique_id: str = None,
   196        kms_key: str = None,
   197        project: str = None,
   198        temp_dataset: Union[str, DatasetReference] = None,
   199        query_priority: Optional[str] = None):
   200      self.options = options
   201      self.use_json_exports = use_json_exports
   202      self.gcs_location = gcs_location
   203      self.bigquery_job_labels = bigquery_job_labels or {}
   204      self._step_name = step_name
   205      self._job_name = job_name or 'BQ_READ_SPLIT'
   206      self._source_uuid = unique_id
   207      self.kms_key = kms_key
   208      self.project = project
   209      self.temp_dataset = temp_dataset or 'bq_read_all_%s' % uuid.uuid4().hex
   210      self.query_priority = query_priority
   211      self.bq_io_metadata = None
   212  
   213    def display_data(self):
   214      return {
   215          'use_json_exports': str(self.use_json_exports),
   216          'gcs_location': str(self.gcs_location),
   217          'bigquery_job_labels': json.dumps(self.bigquery_job_labels),
   218          'kms_key': str(self.kms_key),
   219          'project': str(self.project),
   220          'temp_dataset': str(self.temp_dataset)
   221      }
   222  
   223    def _get_temp_dataset(self):
   224      if isinstance(self.temp_dataset, str):
   225        return DatasetReference(
   226            datasetId=self.temp_dataset, projectId=self._get_project())
   227      else:
   228        return self.temp_dataset
   229  
   230    def process(self,
   231                element: 'ReadFromBigQueryRequest') -> Iterable[BoundedSource]:
   232      bq = bigquery_tools.BigQueryWrapper(
   233          temp_dataset_id=self._get_temp_dataset().datasetId)
   234  
   235      if element.query is not None:
   236        self._setup_temporary_dataset(bq, element)
   237        table_reference = self._execute_query(bq, element)
   238      else:
   239        assert element.table
   240        table_reference = bigquery_tools.parse_table_reference(
   241            element.table, project=self._get_project())
   242  
   243      if not table_reference.projectId:
   244        table_reference.projectId = self._get_project()
   245  
   246      schema, metadata_list = self._export_files(bq, element, table_reference)
   247  
   248      for metadata in metadata_list:
   249        yield self._create_source(metadata.path, schema)
   250  
   251      if element.query is not None:
   252        bq._delete_table(
   253            table_reference.projectId,
   254            table_reference.datasetId,
   255            table_reference.tableId)
   256  
   257      if bq.created_temp_dataset:
   258        self._clean_temporary_dataset(bq, element)
   259  
   260    def _get_bq_metadata(self):
   261      if not self.bq_io_metadata:
   262        self.bq_io_metadata = create_bigquery_io_metadata(self._step_name)
   263      return self.bq_io_metadata
   264  
   265    def _create_source(self, path, schema):
   266      if not self.use_json_exports:
   267        return _create_avro_source(path)
   268      else:
   269        return _TextSource(
   270            path,
   271            min_bundle_size=0,
   272            compression_type=CompressionTypes.UNCOMPRESSED,
   273            strip_trailing_newlines=True,
   274            coder=_JsonToDictCoder(schema))
   275  
   276    def _setup_temporary_dataset(
   277        self,
   278        bq: bigquery_tools.BigQueryWrapper,
   279        element: 'ReadFromBigQueryRequest'):
   280      location = bq.get_query_location(
   281          self._get_project(), element.query, not element.use_standard_sql)
   282      bq.create_temporary_dataset(self._get_project(), location)
   283  
   284    def _clean_temporary_dataset(
   285        self,
   286        bq: bigquery_tools.BigQueryWrapper,
   287        element: 'ReadFromBigQueryRequest'):
   288      bq.clean_up_temporary_dataset(self._get_project())
   289  
   290    def _execute_query(
   291        self,
   292        bq: bigquery_tools.BigQueryWrapper,
   293        element: 'ReadFromBigQueryRequest'):
   294      query_job_name = bigquery_tools.generate_bq_job_name(
   295          self._job_name,
   296          self._source_uuid,
   297          bigquery_tools.BigQueryJobTypes.QUERY,
   298          '%s_%s' % (int(time.time()), random.randint(0, 1000)))
   299      job = bq._start_query_job(
   300          self._get_project(),
   301          element.query,
   302          not element.use_standard_sql,
   303          element.flatten_results,
   304          job_id=query_job_name,
   305          priority=self.query_priority,
   306          kms_key=self.kms_key,
   307          job_labels=self._get_bq_metadata().add_additional_bq_job_labels(
   308              self.bigquery_job_labels))
   309      job_ref = job.jobReference
   310      bq.wait_for_bq_job(job_ref, max_retries=0)
   311      return bq._get_temp_table(self._get_project())
   312  
   313    def _export_files(
   314        self,
   315        bq: bigquery_tools.BigQueryWrapper,
   316        element: 'ReadFromBigQueryRequest',
   317        table_reference: TableReference):
   318      """Runs a BigQuery export job.
   319  
   320      Returns:
   321        bigquery.TableSchema instance, a list of FileMetadata instances
   322      """
   323      job_labels = self._get_bq_metadata().add_additional_bq_job_labels(
   324          self.bigquery_job_labels)
   325      export_job_name = bigquery_tools.generate_bq_job_name(
   326          self._job_name,
   327          self._source_uuid,
   328          bigquery_tools.BigQueryJobTypes.EXPORT,
   329          element.obj_id)
   330      temp_location = self.options.view_as(GoogleCloudOptions).temp_location
   331      gcs_location = bigquery_export_destination_uri(
   332          self.gcs_location,
   333          temp_location,
   334          '%s%s' % (self._source_uuid, element.obj_id))
   335      try:
   336        if self.use_json_exports:
   337          job_ref = bq.perform_extract_job([gcs_location],
   338                                           export_job_name,
   339                                           table_reference,
   340                                           bigquery_tools.FileFormat.JSON,
   341                                           project=self._get_project(),
   342                                           job_labels=job_labels,
   343                                           include_header=False)
   344        else:
   345          job_ref = bq.perform_extract_job([gcs_location],
   346                                           export_job_name,
   347                                           table_reference,
   348                                           bigquery_tools.FileFormat.AVRO,
   349                                           project=self._get_project(),
   350                                           include_header=False,
   351                                           job_labels=job_labels,
   352                                           use_avro_logical_types=True)
   353        bq.wait_for_bq_job(job_ref)
   354      except Exception as exn:  # pylint: disable=broad-except
   355        # The error messages thrown in this case are generic and misleading,
   356        # so leave this breadcrumb in case it's the root cause.
   357        logging.warning(
   358            "Error exporting table: %s. "
   359            "Note that external tables cannot be exported: "
   360            "https://cloud.google.com/bigquery/docs/external-tables"
   361            "#external_table_limitations",
   362            exn)
   363        raise
   364      metadata_list = FileSystems.match([gcs_location])[0].metadata_list
   365  
   366      if isinstance(table_reference, ValueProvider):
   367        table_ref = bigquery_tools.parse_table_reference(
   368            element.table, project=self._get_project())
   369      else:
   370        table_ref = table_reference
   371      table = bq.get_table(
   372          table_ref.projectId, table_ref.datasetId, table_ref.tableId)
   373  
   374      return table.schema, metadata_list
   375  
   376    def _get_project(self):
   377      """Returns the project that queries and exports will be billed to."""
   378  
   379      project = self.options.view_as(GoogleCloudOptions).project
   380      if isinstance(project, ValueProvider):
   381        project = project.get()
   382      if not project:
   383        project = self.project
   384      return project
   385  
   386  
   387  FieldSchema = collections.namedtuple('FieldSchema', 'fields mode name type')
   388  
   389  
   390  class _JsonToDictCoder(coders.Coder):
   391    """A coder for a JSON string to a Python dict."""
   392    def __init__(self, table_schema):
   393      self.fields = self._convert_to_tuple(table_schema.fields)
   394      self._converters = {
   395          'INTEGER': int,
   396          'INT64': int,
   397          'FLOAT': float,
   398          'FLOAT64': float,
   399          'NUMERIC': self._to_decimal,
   400          'BYTES': self._to_bytes,
   401      }
   402  
   403    @staticmethod
   404    def _to_decimal(value):
   405      return decimal.Decimal(value)
   406  
   407    @staticmethod
   408    def _to_bytes(value):
   409      """Converts value from str to bytes."""
   410      return value.encode('utf-8')
   411  
   412    @classmethod
   413    def _convert_to_tuple(cls, table_field_schemas):
   414      """Recursively converts the list of TableFieldSchema instances to the
   415      list of tuples to prevent errors when pickling and unpickling
   416      TableFieldSchema instances.
   417      """
   418      if not table_field_schemas:
   419        return []
   420  
   421      return [
   422          FieldSchema(cls._convert_to_tuple(x.fields), x.mode, x.name, x.type)
   423          for x in table_field_schemas
   424      ]
   425  
   426    def decode(self, value):
   427      value = json.loads(value.decode('utf-8'))
   428      return self._decode_row(value, self.fields)
   429  
   430    def _decode_row(self, row: Dict[str, Any], schema_fields: List[FieldSchema]):
   431      for field in schema_fields:
   432        if field.name not in row:
   433          # The field exists in the schema, but it doesn't exist in this row.
   434          # It probably means its value was null, as the extract to JSON job
   435          # doesn't preserve null fields
   436          row[field.name] = None
   437          continue
   438  
   439        if field.mode == 'REPEATED':
   440          for i, elem in enumerate(row[field.name]):
   441            row[field.name][i] = self._decode_data(elem, field)
   442        else:
   443          row[field.name] = self._decode_data(row[field.name], field)
   444      return row
   445  
   446    def _decode_data(self, obj: Any, field: FieldSchema):
   447      if not field.fields:
   448        try:
   449          return self._converters[field.type](obj)
   450        except KeyError:
   451          # No need to do any conversion
   452          return obj
   453      return self._decode_row(obj, field.fields)
   454  
   455    def is_deterministic(self):
   456      return True
   457  
   458    def to_type_hint(self):
   459      return dict