# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Unit tests for file sinks."""

# pytype: skip-file

import glob
import logging
import os
import shutil
import tempfile
import unittest

import hamcrest as hc
import mock

import apache_beam as beam
from apache_beam.coders import coders
from apache_beam.io import filebasedsink
from apache_beam.io.filesystem import BeamIOError
from apache_beam.options.value_provider import StaticValueProvider
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.transforms.display import DisplayData
from apache_beam.transforms.display_test import DisplayDataItemMatcher

_LOGGER = logging.getLogger(__name__)


# TODO: Refactor code so all io tests are using same library
# TestCaseWithTempDirCleanup class.
class _TestCaseWithTempDirCleanUp(unittest.TestCase):
  """Base class for TestCases that deals with TempDir clean-up.

  Inherited test cases will call self._new_tempdir() to start a temporary dir
  which will be deleted at the end of the tests (when tearDown() is called).
  """
  def setUp(self):
    self._tempdirs = []

  def tearDown(self):
    for path in self._tempdirs:
      if os.path.exists(path):
        shutil.rmtree(path)
    self._tempdirs = []

  def _new_tempdir(self):
    """Creates a temporary directory that tearDown() will remove."""
    result = tempfile.mkdtemp()
    self._tempdirs.append(result)
    return result

  def _create_temp_file(self, name='', suffix='', dir=None, content=None):
    """Creates a temporary file and returns its path.

    Args:
      name: prefix of the file name; defaults to ``tempfile.template``.
      suffix: suffix of the file name.
      dir: directory to create the file in; a new tracked temporary directory
        is created when not given.
      content: optional text to write into the file.

    Returns:
      The path of the created file.
    """
    if not name:
      name = tempfile.template
    if not dir:
      dir = self._new_tempdir()
    # Close the handle right away; only the name of the file is needed here.
    with tempfile.NamedTemporaryFile(
        delete=False, prefix=name, dir=dir, suffix=suffix) as temp_file:
      file_name = temp_file.name

    if content:
      with open(file_name, 'w') as f:
        f.write(content)
    return file_name

  def _read_file(self, path):
    """Returns the text content of ``path``, closing the file afterwards."""
    with open(path) as f:
      return f.read()


class MyFileBasedSink(filebasedsink.FileBasedSink):
  """Sink that brackets each file with [start]/[end] and each record with []."""
  def open(self, temp_path):
    # TODO: Fix main session pickling.
    # file_handle = super().open(temp_path)
    file_handle = filebasedsink.FileBasedSink.open(self, temp_path)
    file_handle.write(b'[start]')
    return file_handle

  def write_encoded_record(self, file_handle, encoded_value):
    file_handle.write(b'[')
    file_handle.write(encoded_value)
    file_handle.write(b']')

  def close(self, file_handle):
    file_handle.write(b'[end]')
    # TODO: Fix main session pickling.
    # super().close(file_handle)
    filebasedsink.FileBasedSink.close(self, file_handle)


class TestFileBasedSink(_TestCaseWithTempDirCleanUp):
  def _common_init(self, sink):
    """Writes two shards ('a','b' and 'x','y','z') through the Sink API.

    Returns:
      A tuple of the init token and the list of the two writer results.
    """
    # Manually invoke the generic Sink API.
    init_token = sink.initialize_write()

    writer1 = sink.open_writer(init_token, '1')
    writer1.write('a')
    writer1.write('b')
    res1 = writer1.close()

    writer2 = sink.open_writer(init_token, '2')
    writer2.write('x')
    writer2.write('y')
    writer2.write('z')
    res2 = writer2.close()

    return init_token, [res1, res2]

  def test_file_sink_writing(self):
    """Finalizing twice yields the shards once and then nothing."""
    temp_path = os.path.join(self._new_tempdir(), 'FileBasedSink')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())

    init_token, writer_results = self._common_init(sink)

    pre_finalize_results = sink.pre_finalize(init_token, writer_results)
    finalize_res1 = list(
        sink.finalize_write(init_token, writer_results, pre_finalize_results))
    # Retry the finalize operation (as if the first attempt was lost).
    finalize_res2 = list(
        sink.finalize_write(init_token, writer_results, pre_finalize_results))

    # Check the results.
    shard1 = temp_path + '-00000-of-00002.output'
    shard2 = temp_path + '-00001-of-00002.output'
    self.assertEqual(finalize_res1, [shard1, shard2])
    self.assertEqual(finalize_res2, [])
    self.assertEqual(self._read_file(shard1), '[start][a][b][end]')
    self.assertEqual(self._read_file(shard2), '[start][x][y][z][end]')

    # Check that any temp files are deleted.
    self.assertCountEqual([shard1, shard2], glob.glob(temp_path + '*'))

  def test_file_sink_display_data(self):
    """Display data exposes the compression and the file pattern."""
    temp_path = os.path.join(self._new_tempdir(), 'display')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
    dd = DisplayData.create_from(sink)
    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher(
            'file_pattern',
            '{}{}'.format(
                temp_path, '-%(shard_num)05d-of-%(num_shards)05d.output'))
    ]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))

  def test_empty_write(self):
    """Writing an empty PCollection still produces one bracketed shard."""
    # Write inside a tracked temp dir so that the output shard is removed by
    # tearDown() instead of being left behind in the system temp directory.
    temp_path = os.path.join(self._new_tempdir(), 'empty_write')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
    with TestPipeline() as p:
      p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
    self.assertEqual(
        self._read_file(temp_path + '-00000-of-00001.output'), '[start][end]')

  def test_static_value_provider_empty_write(self):
    """Same as test_empty_write, with path and suffix as value providers."""
    # Write inside a tracked temp dir so that the output shard is removed by
    # tearDown() instead of being left behind in the system temp directory.
    temp_path = StaticValueProvider(
        value_type=str,
        value=os.path.join(self._new_tempdir(), 'empty_write'))
    sink = MyFileBasedSink(
        temp_path,
        file_name_suffix=StaticValueProvider(value_type=str, value='.output'),
        coder=coders.ToBytesCoder())
    with TestPipeline() as p:
      p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
    self.assertEqual(
        self._read_file(temp_path.get() + '-00000-of-00001.output'),
        '[start][end]')

  def test_fixed_shard_write(self):
    """A fixed shard count and custom template produce the expected files."""
    temp_path = os.path.join(self._new_tempdir(), 'empty')
    sink = MyFileBasedSink(
        temp_path,
        file_name_suffix='.output',
        num_shards=3,
        shard_name_template='_NN_SSS_',
        coder=coders.ToBytesCoder())
    with TestPipeline() as p:
      p | beam.Create(['a', 'b']) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned

    concat = ''.join(
        self._read_file(temp_path + '_03_%03d_.output' % shard_num)
        for shard_num in range(3))
    self.assertIn('][a][', concat)
    self.assertIn('][b][', concat)

  # Not using 'test' in name so that 'nose' doesn't pick this as a test.
  def run_temp_dir_check(
      self,
      no_dir_path,
      dir_path,
      no_dir_root_path,
      dir_root_path,
      prefix,
      separator):
    """Checks the temp dir chosen by initialize_write() for several prefixes.

    Args:
      no_dir_path: valid file path prefix without a trailing separator.
      dir_path: valid file path prefix with a trailing separator.
      no_dir_root_path: root-level prefix without a trailing separator;
        expected to be rejected with ValueError.
      dir_root_path: root-level prefix with a trailing separator; expected to
        be rejected with ValueError.
      prefix: string the returned temp dir must start with.
      separator: path separator of the file system under test.
    """
    def _get_temp_dir(file_path_prefix):
      sink = MyFileBasedSink(
          file_path_prefix,
          file_name_suffix='.output',
          coder=coders.ToBytesCoder())
      return sink.initialize_write()

    for valid_path in (no_dir_path, dir_path):
      temp_dir = _get_temp_dir(valid_path)
      self.assertTrue(temp_dir.startswith(prefix))
      last_sep = temp_dir.rfind(separator)
      self.assertTrue(temp_dir[last_sep + 1:].startswith('beam-temp'))

    for root_path in (no_dir_root_path, dir_root_path):
      with self.assertRaises(ValueError):
        _get_temp_dir(root_path)

  def test_temp_dir_uniqueness(self):
    """1000 consecutive temp dir names for one prefix are all distinct."""
    temp_path = os.path.join(self._new_tempdir(), 'unique')
    sink = MyFileBasedSink(temp_path, coder=coders.ToBytesCoder())
    temp_dir_list = [sink._create_temp_dir(temp_path) for _ in range(1000)]
    temp_dir_set = set(temp_dir_list)
    self.assertEqual(len(temp_dir_list), len(temp_dir_set))

  def test_temp_dir_gcs(self):
    try:
      self.run_temp_dir_check(
          'gs://aaa/bbb',
          'gs://aaa/bbb/',
          'gs://aaa',
          'gs://aaa/',
          'gs://',
          '/')
    except ValueError:
      _LOGGER.debug('Ignoring test since GCP module is not installed')

  @mock.patch('apache_beam.io.localfilesystem.os')
  def test_temp_dir_local(self, filesystem_os_mock):
    # Here we test a unix-like mock file-system
    # (not really testing Unix or Windows since we mock the function of 'os'
    # module).

    def _fake_unix_split(path):
      sep = path.rfind('/')
      if sep < 0:
        raise ValueError('Path must contain a separator')
      return (path[:sep], path[sep + 1:])

    def _fake_unix_join(base, path):
      return base + '/' + path

    filesystem_os_mock.path.abspath = lambda a: a
    filesystem_os_mock.path.split.side_effect = _fake_unix_split
    filesystem_os_mock.path.join.side_effect = _fake_unix_join
    self.run_temp_dir_check('/aaa/bbb', '/aaa/bbb/', '/', '/', '/', '/')

  def test_file_sink_multi_shards(self):
    """1000 writers finalize into 1000 correctly named and filled shards."""
    temp_path = os.path.join(self._new_tempdir(), 'multishard')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())

    # Manually invoke the generic Sink API.
    init_token = sink.initialize_write()

    num_shards = 1000
    writer_results = []
    for i in range(num_shards):
      uuid = 'uuid-%05d' % i
      writer = sink.open_writer(init_token, uuid)
      writer.write('a')
      writer.write('b')
      writer.write(uuid)
      writer_results.append(writer.close())

    pre_finalize_results = sink.pre_finalize(init_token, writer_results)
    res = sorted(
        sink.finalize_write(init_token, writer_results, pre_finalize_results))

    for i in range(num_shards):
      shard_name = '%s-%05d-of-%05d.output' % (temp_path, i, num_shards)
      uuid = 'uuid-%05d' % i
      self.assertEqual(res[i], shard_name)
      self.assertEqual(
          self._read_file(shard_name), ('[start][a][b][%s][end]' % uuid))

    # Check that any temp files are deleted.
    self.assertCountEqual(res, glob.glob(temp_path + '*'))

  @mock.patch.object(filebasedsink.FileSystems, 'rename')
  def test_file_sink_rename_error(self, rename_mock):
    """An error raised by FileSystems.rename surfaces from finalize_write."""
    temp_path = os.path.join(self._new_tempdir(), 'rename_error')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
    init_token, writer_results = self._common_init(sink)
    pre_finalize_results = sink.pre_finalize(init_token, writer_results)

    error_str = 'mock rename error description'
    rename_mock.side_effect = BeamIOError(
        'mock rename error', {('src', 'dst'): error_str})
    with self.assertRaisesRegex(Exception, error_str):
      list(
          sink.finalize_write(init_token, writer_results, pre_finalize_results))

  def test_file_sink_src_missing(self):
    """finalize_write fails when one of the writer temp files is gone."""
    temp_path = os.path.join(self._new_tempdir(), 'src_missing')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
    init_token, writer_results = self._common_init(sink)
    pre_finalize_results = sink.pre_finalize(init_token, writer_results)

    os.remove(writer_results[0])
    with self.assertRaisesRegex(Exception, r'not exist'):
      list(
          sink.finalize_write(init_token, writer_results, pre_finalize_results))

  def test_file_sink_dst_matches_src(self):
    """Re-finalizing succeeds when both source and destination files exist."""
    temp_path = os.path.join(self._new_tempdir(), 'dst_matches_src')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
    init_token, [res1, res2] = self._common_init(sink)

    pre_finalize_results = sink.pre_finalize(init_token, [res1, res2])
    list(sink.finalize_write(init_token, [res1, res2], pre_finalize_results))

    self.assertFalse(os.path.exists(res1))
    self.assertFalse(os.path.exists(res2))
    shard1 = temp_path + '-00000-of-00002.output'
    shard2 = temp_path + '-00001-of-00002.output'
    self.assertEqual(self._read_file(shard1), '[start][a][b][end]')
    self.assertEqual(self._read_file(shard2), '[start][x][y][z][end]')

    # Recreate the writer temp files as copies of the final shards and
    # finalize again; this must not raise.
    os.makedirs(os.path.dirname(res1))
    shutil.copyfile(shard1, res1)
    shutil.copyfile(shard2, res2)
    list(sink.finalize_write(init_token, [res1, res2], pre_finalize_results))

  def test_pre_finalize(self):
    """pre_finalize deletes stale final shards but keeps writer temp files."""
    temp_path = os.path.join(self._new_tempdir(), 'pre_finalize')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
    init_token, [res1, res2] = self._common_init(sink)

    # no-op
    sink.pre_finalize(init_token, [res1, res2])

    # Create finalized outputs from a previous run, which pre_finalize should
    # delete.
    shard1 = temp_path + '-00000-of-00002.output'
    shard2 = temp_path + '-00001-of-00002.output'
    with open(shard1, 'w') as f:
      f.write('foo')
    with open(shard2, 'w') as f:
      f.write('foo')
    self.assertTrue(os.path.exists(res1))
    self.assertTrue(os.path.exists(res2))
    self.assertTrue(os.path.exists(shard1))
    self.assertTrue(os.path.exists(shard2))

    sink.pre_finalize(init_token, [res1, res2])
    self.assertTrue(os.path.exists(res1))
    self.assertTrue(os.path.exists(res2))
    self.assertFalse(os.path.exists(shard1))
    self.assertFalse(os.path.exists(shard2))

  @mock.patch.object(filebasedsink.FileSystems, 'delete')
  def test_pre_finalize_error(self, delete_mock):
    """An error raised by FileSystems.delete surfaces from pre_finalize."""
    temp_path = os.path.join(self._new_tempdir(), 'pre_finalize')
    sink = MyFileBasedSink(
        temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
    init_token, [res1, res2] = self._common_init(sink)

    # no-op
    sink.pre_finalize(init_token, [res1, res2])

    # Create finalized outputs from a previous run, which pre_finalize should
    # delete.
    shard1 = temp_path + '-00000-of-00002.output'
    shard2 = temp_path + '-00001-of-00002.output'
    with open(shard1, 'w') as f:
      f.write('foo')
    with open(shard2, 'w') as f:
      f.write('foo')

    error_str = 'mock rename error description'
    delete_mock.side_effect = BeamIOError(
        'mock rename error', {shard2: error_str})
    with self.assertRaisesRegex(Exception, error_str):
      sink.pre_finalize(init_token, [res1, res2])


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()