github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filebasedio_perf_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Performance tests for file based io connectors."""
    19  
    20  import logging
    21  import sys
    22  import uuid
    23  from typing import Tuple
    24  
    25  import apache_beam as beam
    26  from apache_beam import typehints
    27  from apache_beam.io.filesystems import FileSystems
    28  from apache_beam.io.iobase import Read
    29  from apache_beam.io.textio import ReadFromText
    30  from apache_beam.io.textio import WriteToText
    31  from apache_beam.testing.load_tests.load_test import LoadTest
    32  from apache_beam.testing.load_tests.load_test import LoadTestOptions
    33  from apache_beam.testing.load_tests.load_test_metrics_utils import CountMessages
    34  from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
    35  from apache_beam.testing.synthetic_pipeline import SyntheticSource
    36  from apache_beam.testing.test_pipeline import TestPipeline
    37  from apache_beam.testing.util import assert_that
    38  from apache_beam.testing.util import equal_to
    39  from apache_beam.transforms.util import Reshuffle
    40  
    41  WRITE_NAMESPACE = 'write'
    42  READ_NAMESPACE = 'read'
    43  
    44  _LOGGER = logging.getLogger(__name__)
    45  
    46  
    47  class FileBasedIOTestOptions(LoadTestOptions):
    48    @classmethod
    49    def _add_argparse_args(cls, parser):
    50      parser.add_argument(
    51          '--test_class', required=True, help='Test class to run.')
    52      parser.add_argument(
    53          '--filename_prefix',
    54          required=True,
    55          help='Destination prefix for files generated by the test.')
    56      parser.add_argument(
    57          '--compression_type',
    58          default='auto',
    59          help='File compression type for writing and reading test files.')
    60      parser.add_argument(
    61          '--number_of_shards',
    62          type=int,
    63          default=0,
    64          help='Number of files this test will create during the write phase.')
    65      parser.add_argument(
    66          '--dataset_size',
    67          type=int,
    68          help='Size of data saved on the target filesystem (bytes).')
    69  
    70  
    71  @typehints.with_output_types(bytes)
    72  @typehints.with_input_types(Tuple[bytes, bytes])
    73  class SyntheticRecordToStrFn(beam.DoFn):
    74    """
    75    A DoFn that convert key-value bytes from synthetic source to string record.
    76  
    77    It uses base64 to convert random bytes emitted from the synthetic source.
    78    Therefore, every 3 bytes give 4 bytes long ascii characters.
    79  
    80    Output length = 4(ceil[len(key)/3] + ceil[len(value)/3]) + 1
    81    """
    82    def process(self, element):
    83      import base64
    84      yield base64.b64encode(element[0]) + b' ' + base64.b64encode(element[1])
    85  
    86  
    87  class CreateFolderFn(beam.DoFn):
    88    """Create folder at pipeline runtime."""
    89    def __init__(self, folder):
    90      self.folder = folder
    91  
    92    def process(self, element):
    93      from apache_beam.io.filesystems import FileSystems  # pylint: disable=reimported
    94      filesystem = FileSystems.get_filesystem(self.folder)
    95      if filesystem.has_dirs() and not filesystem.exists(self.folder):
    96        filesystem.mkdirs(self.folder)
    97  
    98  
    99  class TextIOPerfTest:
   100    def run(self):
   101      write_test = _TextIOWritePerfTest(need_cleanup=False)
   102      read_test = _TextIOReadPerfTest(input_folder=write_test.output_folder)
   103      write_test.run()
   104      read_test.run()
   105  
   106  
   107  class _TextIOWritePerfTest(LoadTest):
   108    def __init__(self, need_cleanup=True):
   109      super().__init__(WRITE_NAMESPACE)
   110      self.need_cleanup = need_cleanup
   111      self.test_options = self.pipeline.get_pipeline_options().view_as(
   112          FileBasedIOTestOptions)
   113      self.output_folder = FileSystems.join(
   114          self.test_options.filename_prefix, str(uuid.uuid4()))
   115  
   116    def test(self):
   117      # first makedir if needed
   118      _ = (
   119          self.pipeline
   120          | beam.Impulse()
   121          | beam.ParDo(CreateFolderFn(self.output_folder)))
   122  
   123      # write to text
   124      _ = (
   125          self.pipeline
   126          | 'Produce rows' >> Read(
   127              SyntheticSource(self.parse_synthetic_source_options()))
   128          | 'Count records' >> beam.ParDo(CountMessages(self.metrics_namespace))
   129          | 'Format' >> beam.ParDo(SyntheticRecordToStrFn())
   130          | 'Avoid Fusion' >> Reshuffle()
   131          | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
   132          | 'Write Text' >> WriteToText(
   133              file_path_prefix=FileSystems.join(self.output_folder, 'test'),
   134              compression_type=self.test_options.compression_type,
   135              num_shards=self.test_options.number_of_shards))
   136  
   137    def cleanup(self):
   138      if not self.need_cleanup:
   139        return
   140      try:
   141        FileSystems.delete([self.output_folder])
   142      except IOError:
   143        # may not have delete permission, just raise a warning
   144        _LOGGER.warning(
   145            'Unable to delete file %s during cleanup.', self.output_folder)
   146  
   147  
   148  class _TextIOReadPerfTest(LoadTest):
   149    def __init__(self, input_folder):
   150      super().__init__(READ_NAMESPACE)
   151      self.test_options = self.pipeline.get_pipeline_options().view_as(
   152          FileBasedIOTestOptions)
   153      self.input_folder = input_folder
   154  
   155    def test(self):
   156      output = (
   157          self.pipeline
   158          | 'Read from text' >>
   159          ReadFromText(file_pattern=FileSystems.join(self.input_folder, '*'))
   160          | 'Count records' >> beam.ParDo(CountMessages(self.metrics_namespace))
   161          | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
   162          | 'Count' >> beam.combiners.Count.Globally())
   163      assert_that(output, equal_to([self.input_options['num_records']]))
   164  
   165    def cleanup(self):
   166      try:
   167        #FileSystems.delete([self.input_folder])
   168        pass
   169      except IOError:
   170        # may not have delete permission, just raise a warning
   171        _LOGGER.warning(
   172            'Unable to delete file %s during cleanup.', self.input_folder)
   173  
   174  
   175  if __name__ == '__main__':
   176    logging.basicConfig(level=logging.INFO)
   177  
   178    test_options = TestPipeline().get_pipeline_options().view_as(
   179        FileBasedIOTestOptions)
   180    supported_test_classes = list(
   181        filter(
   182            lambda s: s.endswith('PerfTest') and not s.startswith('_'),
   183            dir(sys.modules[__name__])))
   184  
   185    if test_options.test_class not in supported_test_classes:
   186      raise RuntimeError(
   187          f'Test {test_options.test_class} not found. '
   188          'Supported tests are {supported_test_classes}')
   189  
   190    getattr(sys.modules[__name__], test_options.test_class)().run()