github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filesystem_test.py

# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Unit tests for filesystem module."""
# pytype: skip-file

import bz2
import gzip
import logging
import lzma
import ntpath
import os
import posixpath
import sys
import tempfile
import unittest
import zlib
from io import BytesIO

import zstandard
from parameterized import param
from parameterized import parameterized

from apache_beam.io.filesystem import CompressedFile
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystem import FileMetadata
from apache_beam.io.filesystem import FileSystem

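# TestingFileSystem is an in-memory FileSystem stub: only scheme(),
# has_dirs(), and _list() have real implementations, which is the minimal
# surface the match() tests below need; every other operation raises
# NotImplementedError.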
class TestingFileSystem(FileSystem):
  def __init__(self, pipeline_options, has_dirs=False):
    super().__init__(pipeline_options)
    self._has_dirs = has_dirs
    self._files = {}

  @classmethod
  def scheme(cls):
    # Required for FileSystems.get_filesystem().
    return 'test'

  def join(self, basepath, *paths):
    raise NotImplementedError

  def split(self, path):
    raise NotImplementedError

  def mkdirs(self, path):
    raise NotImplementedError

  def has_dirs(self):
    return self._has_dirs

  def _insert_random_file(self, path, size):
    self._files[path] = size

  def _list(self, dir_or_prefix):
    for path, size in self._files.items():
      if path.startswith(dir_or_prefix):
        yield FileMetadata(path, size)

  def create(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    raise NotImplementedError

  def open(
      self,
      path,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO):
    raise NotImplementedError

  def copy(self, source_file_names, destination_file_names):
    raise NotImplementedError

  def rename(self, source_file_names, destination_file_names):
    raise NotImplementedError

  def exists(self, path):
    raise NotImplementedError

  def size(self, path):
    raise NotImplementedError

  def last_updated(self, path):
    raise NotImplementedError

  def checksum(self, path):
    raise NotImplementedError

  def metadata(self, path):
    raise NotImplementedError

  def delete(self, paths):
    raise NotImplementedError


class TestFileSystem(unittest.TestCase):
  def setUp(self):
    self.fs = TestingFileSystem(pipeline_options=None)

  def _flatten_match(self, match_results):
    return [
        file_metadata for match_result in match_results
        for file_metadata in match_result.metadata_list
    ]

  @parameterized.expand([
      ('gs://gcsio-test/**', all),
      # Does not match root-level files
      ('gs://gcsio-test/**/*', lambda n, i: n not in ['cat.png']),
      # Only matches root-level files
      ('gs://gcsio-test/*', [('cat.png', 19)]),
      (
          'gs://gcsio-test/cow/**',
          [
              ('cow/cat/fish', 2),
              ('cow/cat/blubber', 3),
              ('cow/dog/blubber', 4),
          ]),
      (
          'gs://gcsio-test/cow/ca**',
          [
              ('cow/cat/fish', 2),
              ('cow/cat/blubber', 3),
          ]),
      (
          'gs://gcsio-test/apple/[df]ish/ca*',
          [
              ('apple/fish/cat', 10),
              ('apple/fish/cart', 11),
              ('apple/fish/carl', 12),
              ('apple/dish/cat', 14),
              ('apple/dish/carl', 15),
          ]),
      (
          'gs://gcsio-test/apple/?ish/?a?',
          [
              ('apple/fish/cat', 10),
              ('apple/dish/bat', 13),
              ('apple/dish/cat', 14),
          ]),
      (
          'gs://gcsio-test/apple/fish/car?',
          [
              ('apple/fish/cart', 11),
              ('apple/fish/carl', 12),
          ]),
      (
          'gs://gcsio-test/apple/fish/b*',
          [
              ('apple/fish/blubber', 6),
              ('apple/fish/blowfish', 7),
              ('apple/fish/bambi', 8),
              ('apple/fish/balloon', 9),
          ]),
      (
          'gs://gcsio-test/apple/f*/b*',
          [
              ('apple/fish/blubber', 6),
              ('apple/fish/blowfish', 7),
              ('apple/fish/bambi', 8),
              ('apple/fish/balloon', 9),
          ]),
      (
          'gs://gcsio-test/apple/dish/[cb]at',
          [
              ('apple/dish/bat', 13),
              ('apple/dish/cat', 14),
          ]),
      (
          'gs://gcsio-test/banana/cyrano.m?',
          [
              ('banana/cyrano.md', 17),
              ('banana/cyrano.mb', 18),
          ]),
  ])
  def test_match_glob(self, file_pattern, expected_object_names):
    objects = [
        ('cow/cat/fish', 2), ('cow/cat/blubber', 3), ('cow/dog/blubber', 4),
        ('apple/dog/blubber', 5), ('apple/fish/blubber', 6),
        ('apple/fish/blowfish', 7), ('apple/fish/bambi', 8),
        ('apple/fish/balloon', 9), ('apple/fish/cat', 10),
        ('apple/fish/cart', 11), ('apple/fish/carl', 12),
        ('apple/dish/bat', 13), ('apple/dish/cat', 14), ('apple/dish/carl', 15),
        ('banana/cat', 16), ('banana/cyrano.md', 17), ('banana/cyrano.mb', 18),
        ('cat.png', 19)
    ]
    bucket_name = 'gcsio-test'

    if callable(expected_object_names):
      # A hack around the fact that the parameters do not have access to
      # the "objects" list.

      if expected_object_names is all:
        # It's a placeholder for "all" objects.
        expected_object_names = objects
      else:
        # It's a filter function of type (str, int) -> bool
        # that returns true for expected objects.
        filter_func = expected_object_names
        expected_object_names = [
            (short_path, size) for short_path, size in objects
            if filter_func(short_path, size)
        ]

    for object_name, size in objects:
      file_name = 'gs://%s/%s' % (bucket_name, object_name)
      self.fs._insert_random_file(file_name, size)

    expected_file_names = [('gs://%s/%s' % (bucket_name, object_name), size)
                           for object_name, size in expected_object_names]
    actual_file_names = [
        (file_metadata.path, file_metadata.size_in_bytes)
        for file_metadata in self._flatten_match(self.fs.match([file_pattern]))
    ]

    self.assertEqual(set(actual_file_names), set(expected_file_names))

    # Check if limits are followed correctly.
    limit = 3
    expected_num_items = min(len(expected_object_names), limit)
    self.assertEqual(
        len(self._flatten_match(self.fs.match([file_pattern], [limit]))),
        expected_num_items)

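  # translate_pattern() turns a glob into a regular expression; the cases
  # below check that path separators are escaped correctly for both POSIX
  # and Windows conventions.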
  @parameterized.expand([
      param(
          os_path=posixpath,
          # re.escape does not escape forward slashes since Python 3.7
          # https://docs.python.org/3/whatsnew/3.7.html ("bpo-29995")
          sep_re='\\/' if sys.version_info < (3, 7, 0) else '/'),
      param(os_path=ntpath, sep_re='\\\\'),
  ])
  def test_translate_pattern(self, os_path, sep_re):
    star = r'[^/\\]*'
    double_star = r'.*'
    join = os_path.join

    sep = os_path.sep
    pattern__expected = [
        (join('a', '*'), sep_re.join(['a', star])),
        (join('b', '*') + sep, sep_re.join(['b', star]) + sep_re),
        (r'*[abc\]', star + r'[abc\\]'),
        (join('d', '**', '*'), sep_re.join(['d', double_star, star])),
    ]
    for pattern, expected in pattern__expected:
      expected = r'(?ms)' + expected + r'\Z'
      result = self.fs.translate_pattern(pattern)
      self.assertEqual(expected, result)


class TestFileSystemWithDirs(TestFileSystem):
  def setUp(self):
    self.fs = TestingFileSystem(pipeline_options=None, has_dirs=True)


class TestCompressedFile(unittest.TestCase):
  """Base class for TestCases that deal with temp file clean-up.

  Tests call self._create_temp_file() to create temporary files, which are
  deleted at the end of each test (when tearDown() is called).
  """

  content = b"""- the BEAM -
How things really are we would like to know.
Does
Time
flow, is it elastic, or is it
atomized in instants hammered around the
clock's face? ...
- May Swenson"""

  # Keep the read block size small so that we exercise the seek functionality
  # in the compressed file and not just in the internal buffer.
  read_block_size = 4

  def setUp(self):
    self._tempfiles = []

  def tearDown(self):
    for path in self._tempfiles:
      if os.path.exists(path):
        os.remove(path)

  def _create_temp_file(self):
    path = tempfile.NamedTemporaryFile(delete=False).name
    self._tempfiles.append(path)
    return path

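  # _create_compressed_file writes `content` to a fresh temp file using the
  # codec implied by `compression_type`. DEFLATE is handled with raw
  # zlib.compress() because the zlib module has no file-object helper.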
  def _create_compressed_file(self, compression_type, content):
    file_name = self._create_temp_file()

    if compression_type == CompressionTypes.DEFLATE:
      with open(file_name, 'wb') as f:
        f.write(zlib.compress(content))
    elif compression_type == CompressionTypes.BZIP2 or \
        compression_type == CompressionTypes.GZIP:
      compress_open = bz2.BZ2File \
          if compression_type == CompressionTypes.BZIP2 \
          else gzip.open
      with compress_open(file_name, 'wb') as f:
        f.write(content)
    elif compression_type == CompressionTypes.ZSTD:
      compress_open = zstandard.open
      with compress_open(file_name, 'wb') as f:
        f.write(content)
    elif compression_type == CompressionTypes.LZMA:
      compress_open = lzma.open
      with compress_open(file_name, 'wb') as f:
        f.write(content)
    else:
      assert False, "Invalid compression type: %s" % compression_type

    return file_name

  def test_seekable_enabled_on_read(self):
    with open(self._create_temp_file(), 'rb') as f:
      readable = CompressedFile(f)
      self.assertTrue(readable.seekable)

  def test_seekable_disabled_on_write(self):
    with open(self._create_temp_file(), 'wb') as f:
      writeable = CompressedFile(f)
      self.assertFalse(writeable.seekable)

  def test_seekable_disabled_on_append(self):
    with open(self._create_temp_file(), 'ab') as f:
      writeable = CompressedFile(f)
      self.assertFalse(writeable.seekable)

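  # The seek tests below drive a CompressedFile and a plain BytesIO over the
  # same uncompressed content in lockstep and assert that both yield the
  # same lines and (capped) positions.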
  def test_seek_set(self):
    for compression_type in [CompressionTypes.BZIP2,
                             CompressionTypes.DEFLATE,
                             CompressionTypes.GZIP,
                             CompressionTypes.ZSTD,
                             CompressionTypes.LZMA]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as f:
        compressed_fd = CompressedFile(
            f, compression_type, read_size=self.read_block_size)
        reference_fd = BytesIO(self.content)

        # Note: BytesIO's tell() reports out-of-bound positions (if we seek
        # beyond the file), therefore we need to cap it to max_position.
        # CompressedFile.tell() always stays within the bounds of the
        # uncompressed content.
        # A negative seek position argument is not supported for BytesIO with
        # whence set to SEEK_SET.
        for seek_position in (0,
                              1,
                              len(self.content) - 1,
                              len(self.content),
                              len(self.content) + 1):
          compressed_fd.seek(seek_position, os.SEEK_SET)
          reference_fd.seek(seek_position, os.SEEK_SET)

          uncompressed_line = compressed_fd.readline()
          reference_line = reference_fd.readline()
          self.assertEqual(uncompressed_line, reference_line)

          uncompressed_position = compressed_fd.tell()
          reference_position = reference_fd.tell()
          max_position = len(self.content)
          reference_position = min(reference_position, max_position)
          self.assertEqual(uncompressed_position, reference_position)

  def test_seek_cur(self):
    for compression_type in [CompressionTypes.BZIP2,
                             CompressionTypes.DEFLATE,
                             CompressionTypes.GZIP,
                             CompressionTypes.ZSTD,
                             CompressionTypes.LZMA]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as f:
        compressed_fd = CompressedFile(
            f, compression_type, read_size=self.read_block_size)
        reference_fd = BytesIO(self.content)

        # Test out-of-bound and in-bound seeking in both directions.
        # Note: BytesIO's seek() reports out-of-bound positions (if we seek
        # beyond the file), therefore we need to cap it to max_position (to
        # make it consistent with the old StringIO behavior).
        for seek_position in (-1,
                              0,
                              1,
                              len(self.content) // 2,
                              len(self.content) // 2,
                              -1 * len(self.content) // 2):
          compressed_fd.seek(seek_position, os.SEEK_CUR)
          reference_fd.seek(seek_position, os.SEEK_CUR)

          uncompressed_line = compressed_fd.readline()
          expected_line = reference_fd.readline()
          self.assertEqual(uncompressed_line, expected_line)

          reference_position = reference_fd.tell()
          uncompressed_position = compressed_fd.tell()
          max_position = len(self.content)
          reference_position = min(reference_position, max_position)
          reference_fd.seek(reference_position, os.SEEK_SET)
          self.assertEqual(uncompressed_position, reference_position)

  def test_read_from_end_returns_no_data(self):
    for compression_type in [CompressionTypes.BZIP2,
                             CompressionTypes.DEFLATE,
                             CompressionTypes.GZIP,
                             CompressionTypes.ZSTD,
                             CompressionTypes.LZMA]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as f:
        compressed_fd = CompressedFile(
            f, compression_type, read_size=self.read_block_size)

        seek_position = 0
        compressed_fd.seek(seek_position, os.SEEK_END)

        expected_data = b''
        uncompressed_data = compressed_fd.read(10)

        self.assertEqual(uncompressed_data, expected_data)

  def test_seek_outside(self):
    for compression_type in [CompressionTypes.BZIP2,
                             CompressionTypes.DEFLATE,
                             CompressionTypes.GZIP,
                             CompressionTypes.ZSTD,
                             CompressionTypes.LZMA]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as f:
        compressed_fd = CompressedFile(
            f, compression_type, read_size=self.read_block_size)

        for whence in (os.SEEK_CUR, os.SEEK_SET, os.SEEK_END):
          seek_position = -1 * len(self.content) - 10
          compressed_fd.seek(seek_position, whence)

          expected_position = 0
          uncompressed_position = compressed_fd.tell()
          self.assertEqual(uncompressed_position, expected_position)

          seek_position = len(self.content) + 20
          compressed_fd.seek(seek_position, whence)

          expected_position = len(self.content)
          uncompressed_position = compressed_fd.tell()
          self.assertEqual(uncompressed_position, expected_position)

  def test_read_and_seek_back_to_beginning(self):
    for compression_type in [CompressionTypes.BZIP2,
                             CompressionTypes.DEFLATE,
                             CompressionTypes.GZIP,
                             CompressionTypes.ZSTD,
                             CompressionTypes.LZMA]:
      file_name = self._create_compressed_file(compression_type, self.content)
      with open(file_name, 'rb') as f:
        compressed_fd = CompressedFile(
            f, compression_type, read_size=self.read_block_size)

        first_pass = compressed_fd.readline()
        compressed_fd.seek(0, os.SEEK_SET)
        second_pass = compressed_fd.readline()

        self.assertEqual(first_pass, second_pass)

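  # tell() on a CompressedFile reports the logical (uncompressed) offset,
  # both while writing and while reading back.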
  def test_tell(self):
    lines = [b'line%d\n' % i for i in range(10)]
    tmpfile = self._create_temp_file()
    with open(tmpfile, 'wb') as f:
      writeable = CompressedFile(f)
      current_offset = 0
      for line in lines:
        writeable.write(line)
        current_offset += len(line)
        self.assertEqual(current_offset, writeable.tell())

    with open(tmpfile, 'rb') as f:
      readable = CompressedFile(f)
      current_offset = 0
      while True:
        line = readable.readline()
        current_offset += len(line)
        self.assertEqual(current_offset, readable.tell())
        if not line:
          break

  def test_concatenated_compressed_file(self):
    # The test apache_beam.io.textio_test.test_read_gzip_concat does not
    # trigger the problem in the Beam 2.13 and earlier code base because its
    # test data is smaller than read_size, so it goes through a code path
    # that happens to avoid the bug. This test therefore sets read_size
    # smaller and the test data bigger in order to hit the problem. Testing
    # this in the textio_test module would be difficult: the default
    # read_size is 16MiB, and the ReadFromText interface does not allow
    # read_size to be modified, so the test data would have to be very large.
    import random
    import threading
    from six import int2byte
    num_test_lines = 10
    timeout = 30
    read_size = (64 << 10)  # set much smaller than the line size
    byte_table = tuple(int2byte(i) for i in range(32, 96))

    def generate_random_line():
      byte_list = list(
          b for i in range(4096) for b in random.sample(byte_table, 64))
      byte_list.append(b'\n')
      return b''.join(byte_list)

    def create_test_file(compression_type, lines):
      filenames = []
      file_name = self._create_temp_file()
      if compression_type == CompressionTypes.BZIP2:
        compress_factory = bz2.BZ2File
      elif compression_type == CompressionTypes.GZIP:
        compress_factory = gzip.open
      elif compression_type == CompressionTypes.ZSTD:
        compress_factory = zstandard.open
      elif compression_type == CompressionTypes.LZMA:
        compress_factory = lzma.open
      else:
        assert False, "Invalid compression type: %s" % compression_type
      for line in lines:
        filenames.append(self._create_temp_file())
        with compress_factory(filenames[-1], 'wb') as f:
          f.write(line)
      with open(file_name, 'wb') as o:
        for name in filenames:
          with open(name, 'rb') as i:
            o.write(i.read())
      return file_name

    # A real job reading a concatenated gzip file once sent the Beam
    # filesystem module into an endless loop, which is why this test installs
    # a timeout handler. This unit test does not actually loop forever: in
    # the Beam 2.13 and earlier implementation it hits a different error, so
    # the handler is a safety net rather than a strict necessity.

    def timeout_handler():
      raise IOError('Exiting due to likely infinite loop logic in code.')

    timer = threading.Timer(timeout, timeout_handler)
    try:
      test_lines = tuple(generate_random_line() for i in range(num_test_lines))
      for compression_type in [CompressionTypes.BZIP2,
                               CompressionTypes.GZIP,
                               CompressionTypes.ZSTD,
                               CompressionTypes.LZMA]:
        file_name = create_test_file(compression_type, test_lines)
        timer.start()
        with open(file_name, 'rb') as f:
          data = CompressedFile(f, compression_type, read_size=read_size)
          for written_line in test_lines:
            read_line = data.readline()
            self.assertEqual(written_line, read_line)
        timer.cancel()
        # Starting a new timer for the next iteration/test.
        timer = threading.Timer(timeout, timeout_handler)
    finally:
      timer.cancel()


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()