github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_write_perf_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A pipeline that writes data from Synthetic Source to a BigQuery table.
Besides the standard options, there are options with special meaning:
* output_dataset - BQ dataset name.
* output_table - BQ table name. The table is removed after the test completes.
* input_options - options for Synthetic Source:
  num_records - number of rows to be inserted,
  value_size - the size of a single row in bytes,
  key_size - required option, but its value has no meaning.

Example test run on DataflowRunner:

python -m apache_beam.io.gcp.bigquery_write_perf_test \
    --test-pipeline-options="
    --runner=TestDataflowRunner
    --project=...
    --region=...
    --staging_location=gs://...
    --temp_location=gs://...
    --sdk_location=.../dist/apache-beam-x.x.x.dev0.tar.gz
    --publish_to_big_query=true
    --metrics_dataset=...
    --metrics_table=...
    --output_dataset=...
    --output_table=...
    --input_options='{
    \"num_records\": 1024,
    \"key_size\": 1,
    \"value_size\": 1024
    }'"

This setup results in a table of about 1 MB (1024 rows of 1024 bytes each).
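
A smaller test can also be run locally on DirectRunner. This is a sketch,
assuming metrics publication is simply left disabled (results are then only
reported in the logs):

python -m apache_beam.io.gcp.bigquery_write_perf_test \
    --test-pipeline-options="
    --runner=DirectRunner
    --project=...
    --temp_location=gs://...
    --output_dataset=...
    --output_table=...
    --input_options='{
    \"num_records\": 1024,
    \"key_size\": 1,
    \"value_size\": 1024
    }'"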
"""

# pytype: skip-file

import logging

from apache_beam import Map
from apache_beam import ParDo
from apache_beam.io import BigQueryDisposition
from apache_beam.io import Read
from apache_beam.io import WriteToBigQuery
from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
from apache_beam.io.gcp.tests import utils
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test_metrics_utils import CountMessages
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import SyntheticSource


class BigQueryWritePerfTest(LoadTest):
  def __init__(self):
    super().__init__()
    self.output_dataset = self.pipeline.get_option('output_dataset')
    self.output_table = self.pipeline.get_option('output_table')

  def test(self):
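    # parse_table_schema_from_json turns the JSON description into a
    # TableSchema object; the destination table has a single BYTES column.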
    SCHEMA = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
      # Synthetic Source returns (key, value) records; only the value is
      # written. It is base64-encoded, since that is what the BigQuery API
      # expects for BYTES fields.
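      # For illustration: format_record((b'key', b'value'))
      # returns {'data': b'dmFsdWU='}.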
      import base64
      return {'data': base64.b64encode(record[1])}

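    # CountMessages and MeasureTime are the load-test metric DoFns; they
    # record element count and processing time under the test's metrics
    # namespace so that LoadTest can report them after the run.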
    (  # pylint: disable=expression-not-assigned
        self.pipeline
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Format' >> Map(format_record)
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Write to BigQuery' >> WriteToBigQuery(
            dataset=self.output_dataset,
            table=self.output_table,
            schema=SCHEMA,
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE))

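  # LoadTest is expected to invoke cleanup() after the pipeline finishes, so
  # the output table only lives for the duration of the test.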
  def cleanup(self):
    """Removes the output BQ table."""
    utils.delete_bq_table(
        self.project_id, self.output_dataset, self.output_table)


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  BigQueryWritePerfTest().run()