github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_io_metadata.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Metadata for use in BigQueryIO, i.e. a job_id to use in BQ job labels."""
    19  
    20  # pytype: skip-file
    21  
    22  import re
    23  
    24  from apache_beam.io.gcp import gce_metadata_util
    25  
    26  _VALID_CLOUD_LABEL_PATTERN = re.compile(r'^[a-z0-9\_\-]{1,63}$')
    27  
    28  
    29  def _sanitize_value(value):
    30    """Sanitizes a value into a valid BigQuery label value."""
    31    return re.sub(r'[^\w-]+', '', value.lower().replace('/', '-'))[0:63]
    32  
    33  
    34  def _is_valid_cloud_label_value(label_value):
    35    """Returns true if label_value is a valid cloud label string.
    36  
    37      This function can return false in cases where the label value is valid.
    38      However, it will not return true in a case where the lavel value is invalid.
    39      This is because a stricter set of allowed characters is used in this
    40      validator, because foreign language characters are not accepted.
    41      Thus, this should not be used as a generic validator for all cloud labels.
    42  
    43      See Also:
    44        https://cloud.google.com/compute/docs/labeling-resources
    45  
    46      Args:
    47        label_value: The label value to validate.
    48  
    49      Returns:
    50        True if the label value is a valid
    51    """
    52    return _VALID_CLOUD_LABEL_PATTERN.match(label_value)
    53  
    54  
    55  def create_bigquery_io_metadata(step_name=None):
    56    """Creates a BigQueryIOMetadata.
    57  
    58    This will request metadata properly based on which runner is being used.
    59    """
    60    dataflow_job_id = gce_metadata_util.fetch_dataflow_job_id()
    61    # If a dataflow_job id is returned on GCE metadata. Then it means
    62    # This program is running on a Dataflow GCE VM.
    63    is_dataflow_runner = bool(dataflow_job_id)
    64    kwargs = {}
    65    if is_dataflow_runner:
    66      # Only use this label if it is validated already.
    67      # As we do not want a bad label to fail the BQ job.
    68      if _is_valid_cloud_label_value(dataflow_job_id):
    69        kwargs['beam_job_id'] = dataflow_job_id
    70    if step_name:
    71      step_name = _sanitize_value(step_name)
    72      if _is_valid_cloud_label_value(step_name):
    73        kwargs['step_name'] = step_name
    74    return BigQueryIOMetadata(**kwargs)
    75  
    76  
    77  class BigQueryIOMetadata(object):
    78    """Metadata class for BigQueryIO. i.e. to use as BQ job labels.
    79  
    80    Do not construct directly, use the create_bigquery_io_metadata factory.
    81    Which will request metadata properly based on which runner is being used.
    82    """
    83    def __init__(self, beam_job_id=None, step_name=None):
    84      self.beam_job_id = beam_job_id
    85      self.step_name = step_name
    86  
    87    def add_additional_bq_job_labels(self, job_labels=None):
    88      job_labels = job_labels or {}
    89      if self.beam_job_id and 'beam_job_id' not in job_labels:
    90        job_labels['beam_job_id'] = self.beam_job_id
    91      if self.step_name and 'step_name' not in job_labels:
    92        job_labels['step_name'] = self.step_name
    93      return job_labels