#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A workflow that writes to a BigQuery table with nested and repeated fields.

Demonstrates how to build a bigquery.TableSchema object with nested and repeated
fields. Also, shows how to generate data to be written to a BigQuery table with
nested and repeated fields.
"""

# pytype: skip-file

import argparse
import logging

import apache_beam as beam


def run(argv=None):
  """Build and execute the pipeline that writes sample rows to BigQuery.

  Args:
    argv: Optional list of command-line arguments. ``--output`` (required) is
      consumed here; every argument the parser does not recognize is handed
      to ``beam.Pipeline`` unchanged.
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument(
      '--output',
      required=True,
      help=(
          'Output BigQuery table for results specified as: '
          'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
  known_args, pipeline_args = arg_parser.parse_known_args(argv)

  with beam.Pipeline(argv=pipeline_args) as pipeline:

    # Imported inside the function body rather than at module level, as in the
    # original layout.
    from apache_beam.io.gcp.internal.clients import bigquery  # pylint: disable=wrong-import-order, wrong-import-position

    # The nested record: 'phoneNumber' carries two integer sub-fields.
    phone_number_field = bigquery.TableFieldSchema(
        name='phoneNumber',
        type='record',
        mode='nullable',
        fields=[
            bigquery.TableFieldSchema(
                name='areaCode', type='integer', mode='nullable'),
            bigquery.TableFieldSchema(
                name='number', type='integer', mode='nullable'),
        ])

    # Field order below is the column order of the resulting table schema.
    table_schema = bigquery.TableSchema(
        fields=[
            # Fields that use standard types.
            bigquery.TableFieldSchema(
                name='kind', type='string', mode='nullable'),
            bigquery.TableFieldSchema(
                name='fullName', type='string', mode='required'),
            bigquery.TableFieldSchema(
                name='age', type='integer', mode='nullable'),
            bigquery.TableFieldSchema(
                name='gender', type='string', mode='nullable'),
            # A nested field.
            phone_number_field,
            # A repeated field.
            bigquery.TableFieldSchema(
                name='children', type='string', mode='repeated'),
        ])

    def create_random_record(record_id):
      """Return one row dict, matching the schema, derived from record_id.

      record_id is a numeric string: it is both concatenated into the text
      columns and converted with int() for the integer columns.
      """
      numeric_id = int(record_id)
      child_prefix = 'child' + record_id
      return {
          'kind': 'kind' + record_id,
          'fullName': 'fullName' + record_id,
          'age': numeric_id * 10,
          'gender': 'male',
          'phoneNumber': {
              'areaCode': numeric_id * 100,
              'number': numeric_id * 100000,
          },
          'children': [child_prefix + suffix for suffix in ('1', '2', '3')],
      }

    # Five ids -> five generated rows -> written to the requested table,
    # creating it if needed and replacing whatever rows it already holds.
    _ = (
        pipeline
        | 'CreateIDs' >> beam.Create(['1', '2', '3', '4', '5'])
        | 'CreateRecords' >> beam.Map(create_random_record)
        | 'write' >> beam.io.WriteToBigQuery(
            known_args.output,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    # Nothing above has executed yet: the transforms only describe the
    # pipeline. It is run when this `with` block exits.


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()