github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/bigquery_io_read_pipeline.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A Dataflow job that counts the number of rows in a BigQuery table.

Can be configured to simulate slow reading for a given percentage of rows.
"""
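
# Example invocation (the values below are hypothetical; --input_table,
# --num_records, --num_slow, and --beam_bq_source are defined in run(),
# while --project and --runner are standard Beam pipeline options):
#
#   python -m apache_beam.io.gcp.bigquery_io_read_pipeline \
#       --input_table=my_dataset.my_table \
#       --num_records=100000 \
#       --num_slow=10 \
#       --project=my-gcp-project \
#       --runner=DataflowRunner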

# pytype: skip-file

import argparse
import logging
import random
import time

import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


class RowToStringWithSlowDown(beam.DoFn):
  """Maps each row read from BigQuery to a one-element list.

  num_slow is the percentage of rows, in the range [0, 100), for which a
  10 ms sleep is injected to simulate a slow read.
  """
  def process(self, element, num_slow=0, *args, **kwargs):
    if num_slow == 0:
      yield ['row']
    else:
      # random.random() * 100 is uniform on [0, 100), so the sleep fires
      # with probability num_slow / 100, slowing roughly num_slow% of rows.
      rand = random.random() * 100
      if rand < num_slow:
        time.sleep(0.01)
        yield ['slow_row']
      else:
        yield ['row']


def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_table', required=True, help='Input table to process.')
  parser.add_argument(
      '--num_records',
      required=True,
      help='The expected number of records.',
      type=int)
  parser.add_argument(
      '--num_slow',
      default=0,
      # Parse as float so the percentage comparison in the DoFn works;
      # argparse would otherwise hand it a string.
      type=float,
      help=(
          'Percentage of rows that will be slow. '
          'Must be in the range [0, 100).'))
  parser.add_argument(
      '--beam_bq_source',
      # store_true instead of type=bool: bool('False') is truthy, so a bool
      # type converter could never turn the flag off.
      action='store_true',
      help=(
          'Use the newer ReadFromBigQuery transform '
          'instead of BigQuerySource.'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  options = PipelineOptions(pipeline_args)
  with TestPipeline(options=options) as p:
    if known_args.beam_bq_source:
      reader = ReadFromBigQuery(
          table='%s:%s' %
          (options.view_as(GoogleCloudOptions).project, known_args.input_table))
    else:
      reader = beam.io.Read(beam.io.BigQuerySource(known_args.input_table))

    count = (
        p | 'read' >> reader
        | 'row to string' >> beam.ParDo(
            RowToStringWithSlowDown(), num_slow=known_args.num_slow)
        | 'count' >> beam.combiners.Count.Globally())

    # assert_that runs as part of the pipeline and fails the job if the
    # counted rows do not match the expected record count.
    assert_that(count, equal_to([known_args.num_records]))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()