# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_io_read_pipeline.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A Dataflow job that counts the number of rows in a BQ table.

Can be configured to simulate slow reading for a given number of rows.
"""

# pytype: skip-file

import argparse
import logging
import random
import time

import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

# Spellings accepted by _str_to_bool (compared case-insensitively).
_TRUE_STRINGS = frozenset(['true', 't', 'yes', 'y', '1'])
_FALSE_STRINGS = frozenset(['false', 'f', 'no', 'n', '0', ''])


def _str_to_bool(value):
  """Converts a command-line string into a bool.

  argparse's ``type=bool`` simply calls ``bool()`` on the raw string, so
  ``--flag=False`` would evaluate to True (any non-empty string is truthy).
  This converter interprets the text instead.

  Args:
    value: The raw argument string (a bool is passed through unchanged).

  Returns:
    The parsed boolean.

  Raises:
    argparse.ArgumentTypeError: If the text is not a recognised boolean
      spelling.
  """
  if isinstance(value, bool):
    return value
  normalized = value.strip().lower()
  if normalized in _TRUE_STRINGS:
    return True
  if normalized in _FALSE_STRINGS:
    return False
  raise argparse.ArgumentTypeError(
      'Expected a boolean value (e.g. True/False), got %r' % value)


class RowToStringWithSlowDown(beam.DoFn):
  """Maps every input row to a one-element list, optionally sleeping first."""
  def process(self, element, num_slow=0, *args, **kwargs):
    """Emits ``['row']`` or, for a random share of rows, ``['slow_row']``.

    Args:
      element: The input row; its content is not inspected.
      num_slow: Percentage of rows to slow down. 0 disables the slow path
        entirely (no random number is drawn).
      *args: Ignored.
      **kwargs: Ignored.

    Yields:
      ``['slow_row']`` after a 10 ms sleep for rows selected as slow,
      ``['row']`` otherwise.
    """
    if num_slow == 0:
      yield ['row']
    else:
      # Draw from [0, 100) so that num_slow acts as a percentage threshold.
      rand = random.random() * 100
      if rand < num_slow:
        time.sleep(0.01)
        yield ['slow_row']
      else:
        yield ['row']


def run(argv=None):
  """Builds and runs the row-counting pipeline and checks the row count.

  Args:
    argv: Command-line arguments; when None, argparse falls back to
      ``sys.argv``. Arguments not recognised here are forwarded to
      PipelineOptions.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_table', required=True, help='Input table to process.')
  parser.add_argument(
      '--num_records',
      required=True,
      help='The expected number of records',
      type=int)
  parser.add_argument(
      '--num_slow',
      default=0,
      # Without an explicit type the value arrives as a str, which never
      # equals 0 and cannot be compared against a float in the DoFn.
      type=float,
      help=(
          'Percentage of rows that will be slow. '
          'Must be in the range [0, 100)'))
  parser.add_argument(
      '--beam_bq_source',
      default=False,
      # type=bool would turn the string 'False' into True.
      type=_str_to_bool,
      help=(
          'Whether to use the new ReadFromBigQuery'
          ' transform, or the BigQuerySource.'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  options = PipelineOptions(pipeline_args)
  with TestPipeline(options=options) as p:
    if known_args.beam_bq_source:
      # ReadFromBigQuery is given a project-qualified table reference.
      reader = ReadFromBigQuery(
          table='%s:%s' %
          (options.view_as(GoogleCloudOptions).project, known_args.input_table))
    else:
      reader = beam.io.Read(beam.io.BigQuerySource(known_args.input_table))

    # pylint: disable=expression-not-assigned
    count = (
        p | 'read' >> reader
        | 'row to string' >> beam.ParDo(
            RowToStringWithSlowDown(), num_slow=known_args.num_slow)
        | 'count' >> beam.combiners.Count.Globally())

    assert_that(count, equal_to([known_args.num_records]))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()