# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filebasedio_perf_test.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Performance tests for file based io connectors."""

import logging
import sys
import uuid
from typing import Tuple

import apache_beam as beam
from apache_beam import typehints
from apache_beam.io.filesystems import FileSystems
from apache_beam.io.iobase import Read
from apache_beam.io.textio import ReadFromText
from apache_beam.io.textio import WriteToText
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test import LoadTestOptions
from apache_beam.testing.load_tests.load_test_metrics_utils import CountMessages
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import SyntheticSource
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.transforms.util import Reshuffle

# Metrics namespaces passed to LoadTest by the write and read phases.
WRITE_NAMESPACE = 'write'
READ_NAMESPACE = 'read'

_LOGGER = logging.getLogger(__name__)


class FileBasedIOTestOptions(LoadTestOptions):
  """Command-line options specific to the file based IO performance tests."""
  @classmethod
  def _add_argparse_args(cls, parser):
    """Register the test-specific flags on ``parser``.

    Args:
      parser: the argparse-style parser the flags are added to.
    """
    parser.add_argument(
        '--test_class', required=True, help='Test class to run.')
    parser.add_argument(
        '--filename_prefix',
        required=True,
        help='Destination prefix for files generated by the test.')
    parser.add_argument(
        '--compression_type',
        default='auto',
        help='File compression type for writing and reading test files.')
    parser.add_argument(
        '--number_of_shards',
        type=int,
        default=0,
        help='Number of files this test will create during the write phase.')
    parser.add_argument(
        '--dataset_size',
        type=int,
        help='Size of data saved on the target filesystem (bytes).')


@typehints.with_output_types(bytes)
@typehints.with_input_types(Tuple[bytes, bytes])
class SyntheticRecordToStrFn(beam.DoFn):
  """
  A DoFn that converts key-value bytes from synthetic source to string record.

  It uses base64 to convert random bytes emitted from the synthetic source.
  Therefore, every 3 bytes give 4 bytes long ascii characters.

  Output length = 4(ceil[len(key)/3] + ceil[len(value)/3]) + 1
  """
  def process(self, element):
    """Yield ``b64(key) + b' ' + b64(value)`` for a ``(key, value)`` element.

    Args:
      element: a pair of bytes objects; index 0 is the key, index 1 the value.

    Yields:
      A single bytes record with both parts base64-encoded and separated by
      one space.
    """
    # Imported inside the method, as in the original code.
    import base64
    yield base64.b64encode(element[0]) + b' ' + base64.b64encode(element[1])


class CreateFolderFn(beam.DoFn):
  """Create folder at pipeline runtime."""
  def __init__(self, folder):
    """Remember the folder to create.

    Args:
      folder: path of the folder that ``process`` should ensure exists.
    """
    self.folder = folder

  def process(self, element):
    """Create ``self.folder`` if the filesystem has directories and it is missing.

    Args:
      element: the triggering element; its value is not used.
    """
    from apache_beam.io.filesystems import FileSystems  # pylint: disable=reimported
    filesystem = FileSystems.get_filesystem(self.folder)
    # Only filesystems that report real directories need an explicit mkdirs.
    if filesystem.has_dirs() and not filesystem.exists(self.folder):
      filesystem.mkdirs(self.folder)


class TextIOPerfTest:
  """Runs the text write test followed by the text read test."""
  def run(self):
    """Write synthetic data as text files, then read the same files back."""
    # The write test must not clean up its output: the read test consumes it.
    write_test = _TextIOWritePerfTest(need_cleanup=False)
    read_test = _TextIOReadPerfTest(input_folder=write_test.output_folder)
    write_test.run()
    read_test.run()


class _TextIOWritePerfTest(LoadTest):
  """Load test that writes synthetic records with ``WriteToText``."""
  def __init__(self, need_cleanup=True):
    """Set up the test and pick a unique output folder.

    Args:
      need_cleanup: when False, ``cleanup`` leaves the output folder in place.
    """
    super().__init__(WRITE_NAMESPACE)
    self.need_cleanup = need_cleanup
    self.test_options = self.pipeline.get_pipeline_options().view_as(
        FileBasedIOTestOptions)
    # A random UUID sub-folder keeps runs sharing a prefix from colliding.
    self.output_folder = FileSystems.join(
        self.test_options.filename_prefix, str(uuid.uuid4()))

  def test(self):
    """Build the write pipeline: create the folder, then generate and write."""
    # first makedir if needed
    _ = (
        self.pipeline
        | beam.Impulse()
        | beam.ParDo(CreateFolderFn(self.output_folder)))

    # write to text
    _ = (
        self.pipeline
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Count records' >> beam.ParDo(CountMessages(self.metrics_namespace))
        | 'Format' >> beam.ParDo(SyntheticRecordToStrFn())
        | 'Avoid Fusion' >> Reshuffle()
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
        | 'Write Text' >> WriteToText(
            file_path_prefix=FileSystems.join(self.output_folder, 'test'),
            compression_type=self.test_options.compression_type,
            num_shards=self.test_options.number_of_shards))

  def cleanup(self):
    """Delete the output folder unless cleanup was disabled.

    An ``IOError`` raised by the delete is logged as a warning instead of
    being propagated.
    """
    if not self.need_cleanup:
      return
    try:
      FileSystems.delete([self.output_folder])
    except IOError:
      # may not have delete permission, just raise a warning
      _LOGGER.warning(
          'Unable to delete file %s during cleanup.', self.output_folder)


class _TextIOReadPerfTest(LoadTest):
  """Load test that reads text files back with ``ReadFromText``."""
  def __init__(self, input_folder):
    """Set up the test.

    Args:
      input_folder: folder whose files (pattern ``<input_folder>/*``) are read.
    """
    super().__init__(READ_NAMESPACE)
    self.test_options = self.pipeline.get_pipeline_options().view_as(
        FileBasedIOTestOptions)
    self.input_folder = input_folder

  def test(self):
    """Build the read pipeline and assert on the number of records read."""
    output = (
        self.pipeline
        | 'Read from text' >>
        ReadFromText(file_pattern=FileSystems.join(self.input_folder, '*'))
        | 'Count records' >> beam.ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> beam.combiners.Count.Globally())
    # The global count must equal the configured number of input records.
    assert_that(output, equal_to([self.input_options['num_records']]))

  def cleanup(self):
    """No-op: the files under ``input_folder`` are left in place.

    NOTE(review): the previous body held only a commented-out
    ``FileSystems.delete([self.input_folder])`` inside a ``try``/``except
    IOError`` whose handler could never run. Whether deletion should be
    re-enabled is not visible from this file -- confirm before changing.
    """


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)

  test_options = TestPipeline().get_pipeline_options().view_as(
      FileBasedIOTestOptions)
  # Public test classes are module attributes named '*PerfTest' that do not
  # start with an underscore.
  supported_test_classes = [
      name for name in dir(sys.modules[__name__])
      if name.endswith('PerfTest') and not name.startswith('_')
  ]

  if test_options.test_class not in supported_test_classes:
    # Both literals need the f-prefix; without it the list of supported
    # tests was printed as the raw text '{supported_test_classes}'.
    raise RuntimeError(
        f'Test {test_options.test_class} not found. '
        f'Supported tests are {supported_test_classes}')

  getattr(sys.modules[__name__], test_options.test_class)().run()