github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_read_perf_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

    18  """
    19  A performance test for reading data from a BigQuery table.
    20  Besides of the standard options, there are options with special meaning:
    21  * input_dataset - BQ dataset id.
    22  * input_table - BQ table id.
    23  The table will be created and populated with data from Synthetic Source if it
    24  does not exist.
    25  * input_options - options for Synthetic Source:
    26  num_records - number of rows to be inserted,
    27  value_size - the length of a single row,
    28  key_size - required option, but its value has no meaning.
    29  
Example test run on DataflowRunner:

python -m apache_beam.io.gcp.bigquery_read_perf_test \
    --test-pipeline-options="
    --runner=TestDataflowRunner
    --project=...
    --region=...
    --staging_location=gs://...
    --temp_location=gs://...
    --sdk_location=.../dist/apache-beam-x.x.x.dev0.tar.gz
    --publish_to_big_query=true
    --metrics_dataset=...
    --metrics_table=...
    --input_dataset=...
    --input_table=...
    --input_options='{
    \"num_records\": 1024,
    \"key_size\": 1,
    \"value_size\": 1024
    }'"
"""

# pytype: skip-file

import logging

from apache_beam import Map
from apache_beam import ParDo
from apache_beam.io import BigQueryDisposition
from apache_beam.io import BigQuerySource
from apache_beam.io import Read
from apache_beam.io import WriteToBigQuery
from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test_metrics_utils import CountMessages
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import SyntheticSource
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.transforms.combiners import Count

# pylint: disable=wrong-import-order, wrong-import-position
try:
  from apitools.base.py.exceptions import HttpError
except ImportError:
  HttpError = None
# pylint: enable=wrong-import-order, wrong-import-position

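# The helper below is not part of the original test; it is a minimal,
# hypothetical sketch showing how the --input_options JSON from the module
# docstring can be built. num_records is the number of rows written to the
# input table, value_size is the length of a single row in bytes, and
# key_size is required by Synthetic Source even though its value has no
# meaning for this test.
def _example_input_options():
  import json
  return json.dumps({
      'num_records': 1024,
      'key_size': 1,
      'value_size': 1024,
  })
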

class BigQueryReadPerfTest(LoadTest):
  def __init__(self):
    super().__init__()
    self.input_dataset = self.pipeline.get_option('input_dataset')
    self.input_table = self.pipeline.get_option('input_table')
    self._check_for_input_data()

  def _check_for_input_data(self):
    """Checks if a BQ table with input data exists and creates it if not."""
    wrapper = BigQueryWrapper()
    try:
      wrapper.get_table(self.project_id, self.input_dataset, self.input_table)
    except HttpError as exn:
      # A 404 means the table does not exist yet, so create and populate it.
      if exn.status_code == 404:
        self._create_input_data()

  def _create_input_data(self):
    """
    Runs an additional pipeline which creates test data and waits for its
    completion.
    """
    SCHEMA = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
      # Synthetic Source emits (key, value) records; keep only the value and
      # base64-encode it, since BigQuery expects BYTES values to be passed as
      # base64-encoded strings.
      import base64
      return {'data': base64.b64encode(record[1])}

    with TestPipeline() as p:
      (  # pylint: disable=expression-not-assigned
          p
          | 'Produce rows' >> Read(
              SyntheticSource(self.parse_synthetic_source_options()))
          | 'Format' >> Map(format_record)
          | 'Write to BigQuery' >> WriteToBigQuery(
              dataset=self.input_dataset,
              table=self.input_table,
              schema=SCHEMA,
              create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=BigQueryDisposition.WRITE_EMPTY))
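    # Note on the dispositions above: CREATE_IF_NEEDED lets this populate
    # pipeline create the input table on first use, while WRITE_EMPTY makes
    # the write fail rather than append if the table already contains rows,
    # so reruns cannot silently duplicate the input data.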

  def test(self):
    output = (
        self.pipeline
        | 'Read from BigQuery' >> Read(
            BigQuerySource(dataset=self.input_dataset, table=self.input_table))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())
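    # Count.Globally() collapses the PCollection into a single element holding
    # the total number of rows read back, which should equal num_records from
    # --input_options.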
    assert_that(output, equal_to([self.input_options['num_records']]))


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  BigQueryReadPerfTest().run()