#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tools used to work with Avro files in the context of BigQuery.

Classes, constants and functions in this file are experimental and have no
backwards compatibility guarantees.

NOTHING IN THIS FILE HAS BACKWARDS COMPATIBILITY GUARANTEES.
"""

# BigQuery types as listed in
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
# with aliases (RECORD, BOOLEAN, FLOAT, INTEGER) as defined in
# https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/java/latest/com/google/api/services/bigquery/model/TableFieldSchema.html#setType-java.lang.String-
#
# Values are either a plain Avro type name or, for logical types, a dict
# holding the underlying Avro type plus its logical-type attributes.
BIG_QUERY_TO_AVRO_TYPES = {
  "STRUCT": "record",
  "RECORD": "record",
  "STRING": "string",
  "BOOL": "boolean",
  "BOOLEAN": "boolean",
  "BYTES": "bytes",
  "FLOAT64": "double",
  "FLOAT": "double",
  "INT64": "long",
  "INTEGER": "long",
  "TIME": {
    "type": "long",
    "logicalType": "time-micros",
  },
  "TIMESTAMP": {
    "type": "long",
    "logicalType": "timestamp-micros",
  },
  "DATE": {
    "type": "int",
    "logicalType": "date",
  },
  "DATETIME": "string",
  "NUMERIC": {
    "type": "bytes",
    "logicalType": "decimal",
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type
    "precision": 38,
    "scale": 9,
  },
  "GEOGRAPHY": "string",
}


def get_record_schema_from_dict_table_schema(
    schema_name, table_schema, namespace="apache_beam.io.gcp.bigquery"):
  # type: (Text, Dict[Text, Any], Text) -> Dict[Text, Any] # noqa: F821

  """Convert a table schema into an Avro schema.

  Each entry of ``table_schema["fields"]`` is converted with
  :func:`table_field_to_avro_field`, using ``<namespace>.<schema_name>`` as
  the namespace handed to the field conversion.

  Args:
    schema_name (Text): The name of the record.
    table_schema (Dict[Text, Any]): A BigQuery table schema in dict form.
      Must contain a ``"fields"`` list.
    namespace (Text): The namespace of the Avro schema.

  Returns:
    Dict[Text, Any]: The schema as an Avro RecordSchema.
  """
  avro_fields = [
      table_field_to_avro_field(field, ".".join((namespace, schema_name)))
      for field in table_schema["fields"]
  ]

  return {
      "type": "record",
      "name": schema_name,
      "fields": avro_fields,
      "doc": "Translated Avro Schema for {}".format(schema_name),
      "namespace": namespace,
  }


def table_field_to_avro_field(table_field, namespace):
  # type: (Dict[Text, Any], str) -> Dict[Text, Any] # noqa: F821

  """Convert a BigQuery field to an avro field.

  Args:
    table_field (Dict[Text, Any]): A BigQuery field in dict form. Must contain
      ``"type"`` and ``"name"``; ``"mode"`` defaults to ``"NULLABLE"`` and
      ``"description"``, when non-empty, becomes the Avro ``"doc"``.
    namespace (str): The namespace of the enclosing record. For RECORD/STRUCT
      fields the nested record is created under ``<namespace>.<field name>``.

  Returns:
    Dict[Text, Any]: An equivalent Avro field in dict form.

  Raises:
    AssertionError: If the field has no ``"type"`` or the type has no entry
      in ``BIG_QUERY_TO_AVRO_TYPES``.
    ValueError: If the field mode is not NULLABLE, REQUIRED or REPEATED.
  """
  assert "type" in table_field, \
    "Unable to get type for table field {}".format(table_field)
  assert table_field["type"] in BIG_QUERY_TO_AVRO_TYPES, \
    "Unable to map BigQuery field type {} to avro type".format(
      table_field["type"])

  avro_type = BIG_QUERY_TO_AVRO_TYPES[table_field["type"]]

  if avro_type == "record":
    # Nested records carry their own "fields" list, so the field dict itself
    # doubles as the table schema for the recursive conversion.
    element_type = get_record_schema_from_dict_table_schema(
        table_field["name"],
        table_field,
        namespace=".".join((namespace, table_field["name"])))
  elif isinstance(avro_type, dict):
    # Logical types are stored as dicts in the module-level table. Hand out a
    # copy so that mutating a returned schema cannot corrupt the shared table
    # (or other schemas built from it). The entries are flat, so a shallow
    # copy is sufficient.
    element_type = dict(avro_type)
  else:
    element_type = avro_type

  field_mode = table_field.get("mode", "NULLABLE")

  if field_mode in (None, "NULLABLE"):
    field_type = ["null", element_type]
  elif field_mode == "REQUIRED":
    field_type = element_type
  elif field_mode == "REPEATED":
    field_type = {"type": "array", "items": element_type}
  else:
    raise ValueError("Unknown BigQuery field mode: {}".format(field_mode))

  avro_field = {"type": field_type, "name": table_field["name"]}

  doc = table_field.get("description")
  if doc:
    avro_field["doc"] = doc

  return avro_field