github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/cookbook/bigquery_schema.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """A workflow that writes to a BigQuery table with nested and repeated fields.
    19  
    20  Demonstrates how to build a bigquery.TableSchema object with nested and repeated
    21  fields. Also, shows how to generate data to be written to a BigQuery table with
    22  nested and repeated fields.
    23  """

# pytype: skip-file

import argparse
import logging

import apache_beam as beam


def run(argv=None):
  """Run the workflow."""
  parser = argparse.ArgumentParser()

  parser.add_argument(
      '--output',
      required=True,
      help=(
          'Output BigQuery table for results specified as: '
          'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  with beam.Pipeline(argv=pipeline_args) as p:

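    # The BigQuery schema message classes are imported here, inside run(),
    # rather than at module level. Deferring GCP-specific imports like this is
    # a common pattern in Beam examples, e.g. so the module can still be
    # imported when the optional GCP dependencies are unavailable.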
    from apache_beam.io.gcp.internal.clients import bigquery  # pylint: disable=wrong-import-order, wrong-import-position

    table_schema = bigquery.TableSchema()

    # Fields that use standard types.
    kind_schema = bigquery.TableFieldSchema()
    kind_schema.name = 'kind'
    kind_schema.type = 'string'
    kind_schema.mode = 'nullable'
    table_schema.fields.append(kind_schema)

    full_name_schema = bigquery.TableFieldSchema()
    full_name_schema.name = 'fullName'
    full_name_schema.type = 'string'
    full_name_schema.mode = 'required'
    table_schema.fields.append(full_name_schema)

    age_schema = bigquery.TableFieldSchema()
    age_schema.name = 'age'
    age_schema.type = 'integer'
    age_schema.mode = 'nullable'
    table_schema.fields.append(age_schema)

    gender_schema = bigquery.TableFieldSchema()
    gender_schema.name = 'gender'
    gender_schema.type = 'string'
    gender_schema.mode = 'nullable'
    table_schema.fields.append(gender_schema)

    # A nested field.
    phone_number_schema = bigquery.TableFieldSchema()
    phone_number_schema.name = 'phoneNumber'
    phone_number_schema.type = 'record'
    phone_number_schema.mode = 'nullable'

    area_code = bigquery.TableFieldSchema()
    area_code.name = 'areaCode'
    area_code.type = 'integer'
    area_code.mode = 'nullable'
    phone_number_schema.fields.append(area_code)

    number = bigquery.TableFieldSchema()
    number.name = 'number'
    number.type = 'integer'
    number.mode = 'nullable'
    phone_number_schema.fields.append(number)
    table_schema.fields.append(phone_number_schema)

    # A repeated field.
    children_schema = bigquery.TableFieldSchema()
    children_schema.name = 'children'
    children_schema.type = 'string'
    children_schema.mode = 'repeated'
    table_schema.fields.append(children_schema)
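
    # For reference only (not used below): recent Beam releases also accept the
    # schema argument of WriteToBigQuery as a dict in BigQuery's JSON schema
    # format, so the schema built above could be expressed roughly as the
    # sketch below ('table_schema_dict' is an illustrative name):
    #
    #   table_schema_dict = {
    #       'fields': [
    #           {'name': 'kind', 'type': 'STRING', 'mode': 'NULLABLE'},
    #           {'name': 'fullName', 'type': 'STRING', 'mode': 'REQUIRED'},
    #           {'name': 'age', 'type': 'INTEGER', 'mode': 'NULLABLE'},
    #           {'name': 'gender', 'type': 'STRING', 'mode': 'NULLABLE'},
    #           {
    #               'name': 'phoneNumber', 'type': 'RECORD', 'mode': 'NULLABLE',
    #               'fields': [
    #                   {'name': 'areaCode', 'type': 'INTEGER', 'mode': 'NULLABLE'},
    #                   {'name': 'number', 'type': 'INTEGER', 'mode': 'NULLABLE'},
    #               ],
    #           },
    #           {'name': 'children', 'type': 'STRING', 'mode': 'REPEATED'},
    #       ]
    #   }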

    def create_random_record(record_id):
      return {
          'kind': 'kind' + record_id,
          'fullName': 'fullName' + record_id,
          'age': int(record_id) * 10,
          'gender': 'male',
          'phoneNumber': {
              'areaCode': int(record_id) * 100,
              'number': int(record_id) * 100000
          },
          'children': [
              'child' + record_id + '1',
              'child' + record_id + '2',
              'child' + record_id + '3'
          ]
      }

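    # For example, record_id '1' produces the row:
    #   {'kind': 'kind1', 'fullName': 'fullName1', 'age': 10, 'gender': 'male',
    #    'phoneNumber': {'areaCode': 100, 'number': 100000},
    #    'children': ['child11', 'child12', 'child13']}
    # Nested schema fields are written as nested dicts and repeated fields as
    # lists, mirroring the TableSchema built above.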
    # pylint: disable=expression-not-assigned
    record_ids = p | 'CreateIDs' >> beam.Create(['1', '2', '3', '4', '5'])
    records = record_ids | 'CreateRecords' >> beam.Map(create_random_record)
    records | 'write' >> beam.io.WriteToBigQuery(
        known_args.output,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
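    # CREATE_IF_NEEDED creates the output table from table_schema if it does
    # not already exist, and WRITE_TRUNCATE replaces any existing rows in the
    # table on each run.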

    # Exiting the `with` block runs the pipeline; every transform applied
    # above is deferred until then.


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()