github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_read_perf_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A performance test for reading data from a BigQuery table.
Besides the standard options, there are options with special meaning:
* input_dataset - BQ dataset id.
* input_table - BQ table id. The table will be created and populated with
  data from Synthetic Source if it does not exist.
* input_options - options for Synthetic Source:
  num_records - number of rows to be inserted,
  value_size - the length of a single row,
  key_size - required option, but its value has no meaning.

Example test run on DataflowRunner:

python -m apache_beam.io.gcp.bigquery_read_perf_test \
    --test-pipeline-options="
    --runner=TestDataflowRunner
    --project=...
    --region=...
    --staging_location=gs://...
    --temp_location=gs://...
    --sdk_location=.../dist/apache-beam-x.x.x.dev0.tar.gz
    --publish_to_big_query=true
    --metrics_dataset=...
    --metrics_table=...
    --input_dataset=...
    --input_table=...
    --input_options='{
    \"num_records\": 1024,
    \"key_size\": 1,
    \"value_size\": 1024
    }'"
"""

# pytype: skip-file

import logging

from apache_beam import Map
from apache_beam import ParDo
from apache_beam.io import BigQueryDisposition
from apache_beam.io import BigQuerySource
from apache_beam.io import Read
from apache_beam.io import WriteToBigQuery
from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test_metrics_utils import CountMessages
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import SyntheticSource
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.transforms.combiners import Count

# pylint: disable=wrong-import-order, wrong-import-position
try:
  from apitools.base.py.exceptions import HttpError
except ImportError:
  HttpError = None
# pylint: enable=wrong-import-order, wrong-import-position


class BigQueryReadPerfTest(LoadTest):
  def __init__(self):
    super().__init__()
    self.input_dataset = self.pipeline.get_option('input_dataset')
    self.input_table = self.pipeline.get_option('input_table')
    self._check_for_input_data()

  def _check_for_input_data(self):
    """Checks if a BQ table with input data exists and creates it if not."""
    wrapper = BigQueryWrapper()
    try:
      wrapper.get_table(self.project_id, self.input_dataset, self.input_table)
    except HttpError as exn:
      if exn.status_code == 404:
        self._create_input_data()

  def _create_input_data(self):
    """
    Runs an additional pipeline which creates test data and waits for its
    completion.
    """
    SCHEMA = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
      # Synthetic Source emits (key, value) records; only the value part is
      # kept and base64-encoded to fit the single BYTES column of the schema.
      import base64
      return {'data': base64.b64encode(record[1])}

    with TestPipeline() as p:
      (  # pylint: disable=expression-not-assigned
          p
          | 'Produce rows' >> Read(
              SyntheticSource(self.parse_synthetic_source_options()))
          | 'Format' >> Map(format_record)
          | 'Write to BigQuery' >> WriteToBigQuery(
              dataset=self.input_dataset,
              table=self.input_table,
              schema=SCHEMA,
              create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
              write_disposition=BigQueryDisposition.WRITE_EMPTY))

  def test(self):
    output = (
        self.pipeline
        | 'Read from BigQuery' >> Read(
            BigQuerySource(dataset=self.input_dataset, table=self.input_table))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())
    assert_that(output, equal_to([self.input_options['num_records']]))


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  BigQueryReadPerfTest().run()
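
For a cheaper sanity check before a full Dataflow run, the same module can in principle be pointed at DirectRunner with a small synthetic input. The sketch below is a hedged example, not documented by this file: it reuses the flags shown in the module docstring, assumes DirectRunner is accepted by the load-test framework, and assumes --publish_to_big_query=false skips metrics export (so metrics_dataset and metrics_table are omitted); the project, bucket, dataset and table placeholders must be filled in.

python -m apache_beam.io.gcp.bigquery_read_perf_test \
    --test-pipeline-options="
    --runner=DirectRunner
    --project=...
    --temp_location=gs://...
    --publish_to_big_query=false
    --input_dataset=...
    --input_table=...
    --input_options='{
    \"num_records\": 100,
    \"key_size\": 1,
    \"value_size\": 100
    }'"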