github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_avro_tools.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Tools used to work with Avro files in the context of BigQuery.
    19  
    20  Classes, constants and functions in this file are experimental and have no
    21  backwards compatibility guarantees.
    22  
    23  NOTHING IN THIS FILE HAS BACKWARDS COMPATIBILITY GUARANTEES.
    24  """
    25  
    26  # BigQuery types as listed in
    27  # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
    28  # with aliases (RECORD, BOOLEAN, FLOAT, INTEGER) as defined in
    29  # https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/com/google/api/services/bigquery/model/TableFieldSchema.html#setType-java.lang.String-
    30  BIG_QUERY_TO_AVRO_TYPES = {
    31      "STRUCT": "record",
    32      "RECORD": "record",
    33      "STRING": "string",
    34      "BOOL": "boolean",
    35      "BOOLEAN": "boolean",
    36      "BYTES": "bytes",
    37      "FLOAT64": "double",
    38      "FLOAT": "double",
    39      "INT64": "long",
    40      "INTEGER": "long",
    41      "TIME": {
    42          "type": "long",
    43          "logicalType": "time-micros",
    44      },
    45      "TIMESTAMP": {
    46          "type": "long",
    47          "logicalType": "timestamp-micros",
    48      },
    49      "DATE": {
    50          "type": "int",
    51          "logicalType": "date",
    52      },
    53      "DATETIME": "string",
    54      "NUMERIC": {
    55          "type": "bytes",
    56          "logicalType": "decimal",
    57          # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type
    58          "precision": 38,
    59          "scale": 9,
    60      },
    61      "GEOGRAPHY": "string",
    62  }
    63  
    64  
    65  def get_record_schema_from_dict_table_schema(
    66      schema_name, table_schema, namespace="apache_beam.io.gcp.bigquery"):
    67    # type: (Text, Dict[Text, Any], Text) -> Dict[Text, Any] # noqa: F821
    68  
    69    """Convert a table schema into an Avro schema.
    70  
    71    Args:
    72      schema_name (Text): The name of the record.
    73      table_schema (Dict[Text, Any]): A BigQuery table schema in dict form.
    74      namespace (Text): The namespace of the Avro schema.
    75  
    76    Returns:
    77      Dict[Text, Any]: The schema as an Avro RecordSchema.
    78    """
    79    avro_fields = [
    80        table_field_to_avro_field(field, ".".join((namespace, schema_name)))
    81        for field in table_schema["fields"]
    82    ]
    83  
    84    return {
    85        "type": "record",
    86        "name": schema_name,
    87        "fields": avro_fields,
    88        "doc": "Translated Avro Schema for {}".format(schema_name),
    89        "namespace": namespace,
    90    }
    91  
    92  
    93  def table_field_to_avro_field(table_field, namespace):
    94    # type: (Dict[Text, Any], str) -> Dict[Text, Any] # noqa: F821
    95  
    96    """Convert a BigQuery field to an avro field.
    97  
    98    Args:
    99      table_field (Dict[Text, Any]): A BigQuery field in dict form.
   100  
   101    Returns:
   102      Dict[Text, Any]: An equivalent Avro field in dict form.
   103    """
   104    assert "type" in table_field, \
   105      "Unable to get type for table field {}".format(table_field)
   106    assert table_field["type"] in BIG_QUERY_TO_AVRO_TYPES, \
   107      "Unable to map BigQuery field type {} to avro type".format(
   108        table_field["type"])
   109  
   110    avro_type = BIG_QUERY_TO_AVRO_TYPES[table_field["type"]]
   111  
   112    if avro_type == "record":
   113      element_type = get_record_schema_from_dict_table_schema(
   114          table_field["name"],
   115          table_field,
   116          namespace=".".join((namespace, table_field["name"])))
   117    else:
   118      element_type = avro_type
   119  
   120    field_mode = table_field.get("mode", "NULLABLE")
   121  
   122    if field_mode in (None, "NULLABLE"):
   123      field_type = ["null", element_type]
   124    elif field_mode == "REQUIRED":
   125      field_type = element_type
   126    elif field_mode == "REPEATED":
   127      field_type = {"type": "array", "items": element_type}
   128    else:
   129      raise ValueError("Unkown BigQuery field mode: {}".format(field_mode))
   130  
   131    avro_field = {"type": field_type, "name": table_field["name"]}
   132  
   133    doc = table_field.get("description")
   134    if doc:
   135      avro_field["doc"] = doc
   136  
   137    return avro_field