#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Metadata for use in BigQueryIO, i.e. a job_id to use in BQ job labels."""

# pytype: skip-file

import re

from apache_beam.io.gcp import gce_metadata_util

_VALID_CLOUD_LABEL_PATTERN = re.compile(r'^[a-z0-9\_\-]{1,63}$')


def _sanitize_value(value):
  """Sanitizes a value into a candidate BigQuery label value.

  The value is lower-cased, every '/' is replaced with '-', every run of
  characters that is neither a word character nor '-' is removed, and the
  result is truncated to at most 63 characters.

  The result is not guaranteed to pass _is_valid_cloud_label_value: it may be
  empty, and because ``\\w`` matches Unicode word characters for str patterns,
  non-ASCII letters and digits survive sanitization. Callers should validate
  the returned value before using it as a label.

  Args:
    value: The string to sanitize.

  Returns:
    The sanitized string, at most 63 characters long.
  """
  return re.sub(r'[^\w-]+', '', value.lower().replace('/', '-'))[0:63]


def _is_valid_cloud_label_value(label_value):
  """Returns True if label_value is a valid cloud label string.

  This function can return False in cases where the label value is valid.
  However, it will not return True in a case where the label value is invalid.
  This is because a stricter set of allowed characters is used in this
  validator, because foreign language characters are not accepted.
  Thus, this should not be used as a generic validator for all cloud labels.

  See Also:
    https://cloud.google.com/compute/docs/labeling-resources

  Args:
    label_value: The label value to validate.

  Returns:
    True if the label value consists solely of 1 to 63 lowercase ASCII
    letters, digits, underscores or hyphens; False otherwise.
  """
  # fullmatch() is required here rather than match(): the '$' anchor in the
  # pattern also matches just before a trailing newline, so match() would
  # wrongly accept a value such as 'abc\n'. bool() turns the match object
  # (or None) into the boolean this function is documented to return.
  return bool(_VALID_CLOUD_LABEL_PATTERN.fullmatch(label_value))


def create_bigquery_io_metadata(step_name=None):
  """Creates a BigQueryIOMetadata.

  This will request metadata properly based on which runner is being used.

  Args:
    step_name: Optional name of the pipeline step. It is sanitized and, if
      the sanitized value passes validation, recorded as the 'step_name'
      label. A falsy value (None or '') is ignored.

  Returns:
    A BigQueryIOMetadata holding only those of beam_job_id and step_name
    that are present and passed label validation.
  """
  dataflow_job_id = gce_metadata_util.fetch_dataflow_job_id()
  # If a Dataflow job id is returned from the GCE metadata, it means this
  # program is running on a Dataflow GCE VM.
  is_dataflow_runner = bool(dataflow_job_id)
  kwargs = {}
  if is_dataflow_runner:
    # Only use this label if it is validated already,
    # as we do not want a bad label to fail the BQ job.
    if _is_valid_cloud_label_value(dataflow_job_id):
      kwargs['beam_job_id'] = dataflow_job_id
  if step_name:
    step_name = _sanitize_value(step_name)
    # Sanitization can still leave an empty or otherwise invalid value, so
    # the step name is validated as well before it is used as a label.
    if _is_valid_cloud_label_value(step_name):
      kwargs['step_name'] = step_name
  return BigQueryIOMetadata(**kwargs)


class BigQueryIOMetadata(object):
  """Metadata class for BigQueryIO, i.e. to use as BQ job labels.

  Do not construct directly; use the create_bigquery_io_metadata factory,
  which will request metadata properly based on which runner is being used.
  """
  def __init__(self, beam_job_id=None, step_name=None):
    """Stores the label values.

    Args:
      beam_job_id: Value for the 'beam_job_id' label, or None to omit it.
      step_name: Value for the 'step_name' label, or None to omit it.
    """
    self.beam_job_id = beam_job_id
    self.step_name = step_name

  def add_additional_bq_job_labels(self, job_labels=None):
    """Adds the stored metadata to a dict of BigQuery job labels.

    Labels already present in job_labels are never overwritten.

    Args:
      job_labels: Optional dict of existing job labels. A non-empty dict is
        updated in place; when it is None or empty, a new dict is created
        instead and the argument is left untouched.

    Returns:
      The dict of job labels, including 'beam_job_id' and 'step_name' where
      this object has a truthy value for them and the key was not already set.
    """
    job_labels = job_labels or {}
    if self.beam_job_id and 'beam_job_id' not in job_labels:
      job_labels['beam_job_id'] = self.beam_job_id
    if self.step_name and 'step_name' not in job_labels:
      job_labels['step_name'] = self.step_name
    return job_labels