github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filesystem_test.py

     1  # -*- coding: utf-8 -*-
     2  #
     3  # Licensed to the Apache Software Foundation (ASF) under one or more
     4  # contributor license agreements.  See the NOTICE file distributed with
     5  # this work for additional information regarding copyright ownership.
     6  # The ASF licenses this file to You under the Apache License, Version 2.0
     7  # (the "License"); you may not use this file except in compliance with
     8  # the License.  You may obtain a copy of the License at
     9  #
    10  #    http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  # Unless required by applicable law or agreed to in writing, software
    13  # distributed under the License is distributed on an "AS IS" BASIS,
    14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  # See the License for the specific language governing permissions and
    16  # limitations under the License.
    17  #
    18  
    19  """Unit tests for the filesystem module."""
    20  # pytype: skip-file
    21  
    22  import bz2
    23  import gzip
    24  import logging
    25  import lzma
    26  import ntpath
    27  import os
    28  import posixpath
    29  import sys
    30  import tempfile
    31  import unittest
    32  import zlib
    33  from io import BytesIO
    34  
    35  import zstandard
    36  from parameterized import param
    37  from parameterized import parameterized
    38  
    39  from apache_beam.io.filesystem import CompressedFile
    40  from apache_beam.io.filesystem import CompressionTypes
    41  from apache_beam.io.filesystem import FileMetadata
    42  from apache_beam.io.filesystem import FileSystem
    43  
    44  
    45  class TestingFileSystem(FileSystem):
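          """In-memory stub of the FileSystem interface for these tests.

          Only scheme(), has_dirs(), _list() and _insert_random_file() are
          implemented; every other operation raises NotImplementedError.
          """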
    46    def __init__(self, pipeline_options, has_dirs=False):
    47      super().__init__(pipeline_options)
    48      self._has_dirs = has_dirs
    49      self._files = {}
    50  
    51    @classmethod
    52    def scheme(cls):
    53      # Required for FileSystems.get_filesystem().
    54      return 'test'
    55  
    56    def join(self, basepath, *paths):
    57      raise NotImplementedError
    58  
    59    def split(self, path):
    60      raise NotImplementedError
    61  
    62    def mkdirs(self, path):
    63      raise NotImplementedError
    64  
    65    def has_dirs(self):
    66      return self._has_dirs
    67  
    68    def _insert_random_file(self, path, size):
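            # Register a fake file entry; only its path and size matter here.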
    69      self._files[path] = size
    70  
    71    def _list(self, dir_or_prefix):
    72      for path, size in self._files.items():
    73        if path.startswith(dir_or_prefix):
    74          yield FileMetadata(path, size)
    75  
    76    def create(
    77        self,
    78        path,
    79        mime_type='application/octet-stream',
    80        compression_type=CompressionTypes.AUTO):
    81      raise NotImplementedError
    82  
    83    def open(
    84        self,
    85        path,
    86        mime_type='application/octet-stream',
    87        compression_type=CompressionTypes.AUTO):
    88      raise NotImplementedError
    89  
    90    def copy(self, source_file_names, destination_file_names):
    91      raise NotImplementedError
    92  
    93    def rename(self, source_file_names, destination_file_names):
    94      raise NotImplementedError
    95  
    96    def exists(self, path):
    97      raise NotImplementedError
    98  
    99    def size(self, path):
   100      raise NotImplementedError
   101  
   102    def last_updated(self, path):
   103      raise NotImplementedError
   104  
   105    def checksum(self, path):
   106      raise NotImplementedError
   107  
   108    def metadata(self, path):
   109      raise NotImplementedError
   110  
   111    def delete(self, paths):
   112      raise NotImplementedError
   113  
   114  
   115  class TestFileSystem(unittest.TestCase):
   116    def setUp(self):
   117      self.fs = TestingFileSystem(pipeline_options=None)
   118  
   119    def _flatten_match(self, match_results):
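            """Flatten MatchResults into a single list of FileMetadata."""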
   120      return [
   121          file_metadata for match_result in match_results
   122          for file_metadata in match_result.metadata_list
   123      ]
   124  
   125    @parameterized.expand([
   126        ('gs://gcsio-test/**', all),
   127        # Does not match root-level files
   128        ('gs://gcsio-test/**/*', lambda n, i: n not in ['cat.png']),
   129        # Only matches root-level files
   130        ('gs://gcsio-test/*', [('cat.png', 19)]),
   131        (
   132            'gs://gcsio-test/cow/**', [
   133                ('cow/cat/fish', 2),
   134                ('cow/cat/blubber', 3),
   135                ('cow/dog/blubber', 4),
   136            ]),
   137        (
   138            'gs://gcsio-test/cow/ca**', [
   139                ('cow/cat/fish', 2),
   140                ('cow/cat/blubber', 3),
   141            ]),
   142        (
   143            'gs://gcsio-test/apple/[df]ish/ca*',
   144            [
   145                ('apple/fish/cat', 10),
   146                ('apple/fish/cart', 11),
   147                ('apple/fish/carl', 12),
   148                ('apple/dish/cat', 14),
   149                ('apple/dish/carl', 15),
   150            ]),
   151        (
   152            'gs://gcsio-test/apple/?ish/?a?',
   153            [
   154                ('apple/fish/cat', 10),
   155                ('apple/dish/bat', 13),
   156                ('apple/dish/cat', 14),
   157            ]),
   158        (
   159            'gs://gcsio-test/apple/fish/car?', [
   160                ('apple/fish/cart', 11),
   161                ('apple/fish/carl', 12),
   162            ]),
   163        (
   164            'gs://gcsio-test/apple/fish/b*',
   165            [
   166                ('apple/fish/blubber', 6),
   167                ('apple/fish/blowfish', 7),
   168                ('apple/fish/bambi', 8),
   169                ('apple/fish/balloon', 9),
   170            ]),
   171        (
   172            'gs://gcsio-test/apple/f*/b*',
   173            [
   174                ('apple/fish/blubber', 6),
   175                ('apple/fish/blowfish', 7),
   176                ('apple/fish/bambi', 8),
   177                ('apple/fish/balloon', 9),
   178            ]),
   179        (
   180            'gs://gcsio-test/apple/dish/[cb]at', [
   181                ('apple/dish/bat', 13),
   182                ('apple/dish/cat', 14),
   183            ]),
   184        (
   185            'gs://gcsio-test/banana/cyrano.m?', [
   186                ('banana/cyrano.md', 17),
   187                ('banana/cyrano.mb', 18),
   188            ]),
   189    ])
   190    def test_match_glob(self, file_pattern, expected_object_names):
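            # Each case above pairs a glob pattern with its expected matches from
            # the "objects" list below: '**' may cross directory boundaries, while
            # '*', '?' and '[...]' match within a single path component.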
   191      objects = [
   192          ('cow/cat/fish', 2), ('cow/cat/blubber', 3), ('cow/dog/blubber', 4),
   193          ('apple/dog/blubber', 5), ('apple/fish/blubber', 6),
   194          ('apple/fish/blowfish', 7), ('apple/fish/bambi', 8),
   195          ('apple/fish/balloon', 9), ('apple/fish/cat', 10),
   196          ('apple/fish/cart', 11), ('apple/fish/carl', 12),
   197          ('apple/dish/bat', 13), ('apple/dish/cat', 14), ('apple/dish/carl', 15),
   198          ('banana/cat', 16), ('banana/cyrano.md', 17), ('banana/cyrano.mb',
   199                                                         18), ('cat.png', 19)
   200      ]
   201      bucket_name = 'gcsio-test'
   202  
   203      if callable(expected_object_names):
   204        # A hack around the fact that the parameters do not have access to
   205        # the "objects" list.
   206  
   207        if expected_object_names is all:
   208          # It's a placeholder for "all" objects
   209          expected_object_names = objects
   210        else:
   211          # It's a filter function of type (str, int) -> bool
   212          # that returns True for expected objects.
   213          filter_func = expected_object_names
   214          expected_object_names = [(short_path, size) for short_path,
   215                                   size in objects
   216                                   if filter_func(short_path, size)]
   217  
   218      for object_name, size in objects:
   219        file_name = 'gs://%s/%s' % (bucket_name, object_name)
   220        self.fs._insert_random_file(file_name, size)
   221  
   222      expected_file_names = [('gs://%s/%s' % (bucket_name, object_name), size)
   223                             for object_name,
   224                             size in expected_object_names]
   225      actual_file_names = [
   226          (file_metadata.path, file_metadata.size_in_bytes)
   227          for file_metadata in self._flatten_match(self.fs.match([file_pattern]))
   228      ]
   229  
   230      self.assertEqual(set(actual_file_names), set(expected_file_names))
   231  
   232      # Check if limits are followed correctly
   233      limit = 3
   234      expected_num_items = min(len(expected_object_names), limit)
   235      self.assertEqual(
   236          len(self._flatten_match(self.fs.match([file_pattern], [limit]))),
   237          expected_num_items)
   238  
   239    @parameterized.expand([
   240        param(
   241            os_path=posixpath,
   242            # re.escape does not escape forward slashes since Python 3.7
   243            # https://docs.python.org/3/whatsnew/3.7.html ("bpo-29995")
   244            sep_re='\\/' if sys.version_info < (3, 7, 0) else '/'),
   245        param(os_path=ntpath, sep_re='\\\\'),
   246    ])
   247    def test_translate_pattern(self, os_path, sep_re):
   248      star = r'[^/\\]*'
   249      double_star = r'.*'
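            # A single '*' must not cross a path separator, whereas '**' matches
            # anything, including separators.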
   250      join = os_path.join
   251  
   252      sep = os_path.sep
   253      pattern__expected = [
   254          (join('a', '*'), sep_re.join(['a', star])),
   255          (join('b', '*') + sep, sep_re.join(['b', star]) + sep_re),
   256          (r'*[abc\]', star + r'[abc\\]'),
   257          (join('d', '**', '*'), sep_re.join(['d', double_star, star])),
   258      ]
   259      for pattern, expected in pattern__expected:
   260        expected = r'(?ms)' + expected + r'\Z'
   261        result = self.fs.translate_pattern(pattern)
   262        self.assertEqual(expected, result)
   263  
   264  
   265  class TestFileSystemWithDirs(TestFileSystem):
   266    def setUp(self):
   267      self.fs = TestingFileSystem(pipeline_options=None, has_dirs=True)
   268  
   269  
   270  class TestCompressedFile(unittest.TestCase):
   271    """Tests for CompressedFile that create and clean up temporary files.
   272
   273    Test cases call self._create_temp_file() to create a temporary file,
   274    which is deleted at the end of the test (when tearDown() is called).
   275    """
   276  
   277    content = b"""- the BEAM -
   278  How things really are we would like to know.
   279  Does
   280       Time
   281            flow, is it elastic, or is it
   282  atomized in instants hammered around the
   283      clock's face? ...
   284  - May Swenson"""
   285  
   286    # Keep the read block size small so that we exercise the seek functionality
   287    # in the compressed file and not just in the internal buffer.
   288    read_block_size = 4
   289  
   290    def setUp(self):
   291      self._tempfiles = []
   292  
   293    def tearDown(self):
   294      for path in self._tempfiles:
   295        if os.path.exists(path):
   296          os.remove(path)
   297  
   298    def _create_temp_file(self):
   299      path = tempfile.NamedTemporaryFile(delete=False).name
   300      self._tempfiles.append(path)
   301      return path
   302  
   303    def _create_compressed_file(self, compression_type, content):
   304      file_name = self._create_temp_file()
   305  
   306      if compression_type == CompressionTypes.DEFLATE:
   307        with open(file_name, 'wb') as f:
   308          f.write(zlib.compress(content))
   309      elif compression_type == CompressionTypes.BZIP2 or \
   310              compression_type == CompressionTypes.GZIP:
   311        compress_open = bz2.BZ2File \
   312            if compression_type == CompressionTypes.BZIP2 \
   313            else gzip.open
   314        with compress_open(file_name, 'wb') as f:
   315          f.write(content)
   316      elif compression_type == CompressionTypes.ZSTD:
   317        compress_open = zstandard.open
   318        with compress_open(file_name, 'wb') as f:
   319          f.write(content)
   320      elif compression_type == CompressionTypes.LZMA:
   321        compress_open = lzma.open
   322        with compress_open(file_name, 'wb') as f:
   323          f.write(content)
   324      else:
   325        assert False, "Invalid compression type: %s" % compression_type
   326  
   327      return file_name
   328  
   329    def test_seekable_enabled_on_read(self):
   330      with open(self._create_temp_file(), 'rb') as f:
   331        readable = CompressedFile(f)
   332        self.assertTrue(readable.seekable)
   333  
   334    def test_seekable_disabled_on_write(self):
   335      with open(self._create_temp_file(), 'wb') as f:
   336        writeable = CompressedFile(f)
   337        self.assertFalse(writeable.seekable)
   338  
   339    def test_seekable_disabled_on_append(self):
   340      with open(self._create_temp_file(), 'ab') as f:
   341        writeable = CompressedFile(f)
   342        self.assertFalse(writeable.seekable)
   343  
   344    def test_seek_set(self):
   345      for compression_type in [CompressionTypes.BZIP2,
   346                               CompressionTypes.DEFLATE,
   347                               CompressionTypes.GZIP,
   348                               CompressionTypes.ZSTD,
   349                               CompressionTypes.LZMA]:
   350        file_name = self._create_compressed_file(compression_type, self.content)
   351        with open(file_name, 'rb') as f:
   352          compressed_fd = CompressedFile(
   353              f, compression_type, read_size=self.read_block_size)
   354          reference_fd = BytesIO(self.content)
   355  
   356          # Note: BytesIO's tell() reports out-of-bounds positions (if we seek
   357          # beyond the end of the file), so we need to cap it to max_position;
   358          # CompressedFile.tell() always stays within the bounds of the
   359          # uncompressed content.
   360          # A negative seek position argument is not supported by BytesIO with
   361          # whence set to SEEK_SET.
   362          for seek_position in (0,
   363                                1,
   364                                len(self.content) - 1,
   365                                len(self.content),
   366                                len(self.content) + 1):
   367            compressed_fd.seek(seek_position, os.SEEK_SET)
   368            reference_fd.seek(seek_position, os.SEEK_SET)
   369  
   370            uncompressed_line = compressed_fd.readline()
   371            reference_line = reference_fd.readline()
   372            self.assertEqual(uncompressed_line, reference_line)
   373  
   374            uncompressed_position = compressed_fd.tell()
   375            reference_position = reference_fd.tell()
   376            max_position = len(self.content)
   377            reference_position = min(reference_position, max_position)
   378            self.assertEqual(uncompressed_position, reference_position)
   379  
   380    def test_seek_cur(self):
   381      for compression_type in [CompressionTypes.BZIP2,
   382                               CompressionTypes.DEFLATE,
   383                               CompressionTypes.GZIP,
   384                               CompressionTypes.ZSTD,
   385                               CompressionTypes.LZMA]:
   386        file_name = self._create_compressed_file(compression_type, self.content)
   387        with open(file_name, 'rb') as f:
   388          compressed_fd = CompressedFile(
   389              f, compression_type, read_size=self.read_block_size)
   390          reference_fd = BytesIO(self.content)
   391  
   392          # Test out-of-bounds and in-bounds seeking in both directions.
   393          # Note: BytesIO's seek() reports out-of-bounds positions (if we seek
   394          # beyond the end of the file), so we need to cap it to max_position
   395          # (to keep it consistent with the old StringIO behavior).
   396          for seek_position in (-1,
   397                                0,
   398                                1,
   399                                len(self.content) // 2,
   400                                len(self.content) // 2,
   401                                -1 * len(self.content) // 2):
   402            compressed_fd.seek(seek_position, os.SEEK_CUR)
   403            reference_fd.seek(seek_position, os.SEEK_CUR)
   404  
   405            uncompressed_line = compressed_fd.readline()
   406            expected_line = reference_fd.readline()
   407            self.assertEqual(uncompressed_line, expected_line)
   408  
   409            reference_position = reference_fd.tell()
   410            uncompressed_position = compressed_fd.tell()
   411            max_position = len(self.content)
   412            reference_position = min(reference_position, max_position)
   413            reference_fd.seek(reference_position, os.SEEK_SET)
   414            self.assertEqual(uncompressed_position, reference_position)
   415  
   416    def test_read_from_end_returns_no_data(self):
   417      for compression_type in [CompressionTypes.BZIP2,
   418                               CompressionTypes.DEFLATE,
   419                               CompressionTypes.GZIP,
   420                               CompressionTypes.ZSTD,
   421                               CompressionTypes.LZMA]:
   422        file_name = self._create_compressed_file(compression_type, self.content)
   423        with open(file_name, 'rb') as f:
   424          compressed_fd = CompressedFile(
   425              f, compression_type, read_size=self.read_block_size)
   426  
   427          seek_position = 0
   428          compressed_fd.seek(seek_position, os.SEEK_END)
   429  
   430          expected_data = b''
   431          uncompressed_data = compressed_fd.read(10)
   432  
   433          self.assertEqual(uncompressed_data, expected_data)
   434  
   435    def test_seek_outside(self):
   436      for compression_type in [CompressionTypes.BZIP2,
   437                               CompressionTypes.DEFLATE,
   438                               CompressionTypes.GZIP,
   439                               CompressionTypes.ZSTD,
   440                               CompressionTypes.LZMA]:
   441        file_name = self._create_compressed_file(compression_type, self.content)
   442        with open(file_name, 'rb') as f:
   443          compressed_fd = CompressedFile(
   444              f, compression_type, read_size=self.read_block_size)
   445  
   446          for whence in (os.SEEK_CUR, os.SEEK_SET, os.SEEK_END):
   447            seek_position = -1 * len(self.content) - 10
   448            compressed_fd.seek(seek_position, whence)
   449  
   450            expected_position = 0
   451            uncompressed_position = compressed_fd.tell()
   452            self.assertEqual(uncompressed_position, expected_position)
   453  
   454            seek_position = len(self.content) + 20
   455            compressed_fd.seek(seek_position, whence)
   456  
   457            expected_position = len(self.content)
   458            uncompressed_position = compressed_fd.tell()
   459            self.assertEqual(uncompressed_position, expected_position)
   460  
   461    def test_read_and_seek_back_to_beginning(self):
   462      for compression_type in [CompressionTypes.BZIP2,
   463                               CompressionTypes.DEFLATE,
   464                               CompressionTypes.GZIP,
   465                               CompressionTypes.ZSTD,
   466                               CompressionTypes.LZMA]:
   467        file_name = self._create_compressed_file(compression_type, self.content)
   468        with open(file_name, 'rb') as f:
   469          compressed_fd = CompressedFile(
   470              f, compression_type, read_size=self.read_block_size)
   471  
   472          first_pass = compressed_fd.readline()
   473          compressed_fd.seek(0, os.SEEK_SET)
   474          second_pass = compressed_fd.readline()
   475  
   476          self.assertEqual(first_pass, second_pass)
   477  
   478    def test_tell(self):
   479      lines = [b'line%d\n' % i for i in range(10)]
   480      tmpfile = self._create_temp_file()
   481      with open(tmpfile, 'wb') as f:
   482        writeable = CompressedFile(f)
   483        current_offset = 0
   484        for line in lines:
   485          writeable.write(line)
   486          current_offset += len(line)
   487          self.assertEqual(current_offset, writeable.tell())
   488  
   489      with open(tmpfile, 'rb') as f:
   490        readable = CompressedFile(f)
   491        current_offset = 0
   492        while True:
   493          line = readable.readline()
   494          current_offset += len(line)
   495          self.assertEqual(current_offset, readable.tell())
   496          if not line:
   497            break
   498  
   499    def test_concatenated_compressed_file(self):
   500      # The test apache_beam.io.textio_test.test_read_gzip_concat
   501      # does not encounter the problem in the Beam 2.13 and earlier
   502      # code base because its test data is too small: the data is
   503      # smaller than read_size, so it takes a code path that happens
   504      # to avoid the problem.  So, this test uses a smaller read_size
   505      # and larger test data in order to trigger the problem.  It
   506      # would be difficult to test in the textio_test module, because
   507      # very large test data would be needed: the default read_size
   508      # is 16MiB, and the ReadFromText interface does not allow the
   509      # read_size to be modified.
   510      import random
   511      import threading
   512      from six import int2byte
   513      num_test_lines = 10
   514      timeout = 30
   515      read_size = (64 << 10)  # set much smaller than the line size
   516      byte_table = tuple(int2byte(i) for i in range(32, 96))
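            # byte_table holds printable ASCII bytes (codes 32-95); each random
            # line built from it below is 4096 * 64 bytes plus a newline (about
            # 256 KiB), much larger than read_size.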
   517  
   518      def generate_random_line():
   519        byte_list = list(
   520            b for i in range(4096) for b in random.sample(byte_table, 64))
   521        byte_list.append(b'\n')
   522        return b''.join(byte_list)
   523  
   524      def create_test_file(compression_type, lines):
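              # Compress each line into its own temporary file, then concatenate
              # the compressed bytes so the result is a concatenated
              # (multi-stream) compressed file.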
   525        filenames = []
   526        file_name = self._create_temp_file()
   527        if compression_type == CompressionTypes.BZIP2:
   528          compress_factory = bz2.BZ2File
   529        elif compression_type == CompressionTypes.GZIP:
   530          compress_factory = gzip.open
   531        elif compression_type == CompressionTypes.ZSTD:
   532          compress_factory = zstandard.open
   533        elif compression_type == CompressionTypes.LZMA:
   534          compress_factory = lzma.open
   535        else:
   536          assert False, "Invalid compression type: %s" % compression_type
   537        for line in lines:
   538          filenames.append(self._create_temp_file())
   539          with compress_factory(filenames[-1], 'wb') as f:
   540            f.write(line)
   541        with open(file_name, 'wb') as o:
   542          for name in filenames:
   543            with open(name, 'rb') as i:
   544              o.write(i.read())
   545        return file_name
   546  
   547      # Some time ago, when a job ran with a real concatenated gzip file, I
   548      # got into an endless loop in the Beam filesystem module. That's why
   549      # this handler is here to trap an endless loop. However, this unit
   550      # test doesn't encounter an endless loop; it encounters a different
   551      # error in the Beam 2.13 and earlier implementation, so this handler
   552      # is not strictly necessary for this unit test.
   553  
   554      def timeout_handler():
   555          raise IOError('Exiting due to likely infinite loop in the code.')
   556  
   557      timer = threading.Timer(timeout, timeout_handler)
   558      try:
   559        test_lines = tuple(generate_random_line() for i in range(num_test_lines))
   560        for compression_type in [CompressionTypes.BZIP2,
   561                                 CompressionTypes.GZIP,
   562                                 CompressionTypes.ZSTD,
   563                                 CompressionTypes.LZMA]:
   564          file_name = create_test_file(compression_type, test_lines)
   565          timer.start()
   566          with open(file_name, 'rb') as f:
   567            data = CompressedFile(f, compression_type, read_size=read_size)
   568            for written_line in test_lines:
   569              read_line = data.readline()
   570              self.assertEqual(written_line, read_line)
   571          timer.cancel()
   572          # Starting a new timer for the next iteration/test.
   573          timer = threading.Timer(timeout, timeout_handler)
   574      finally:
   575        timer.cancel()
   576  
   577  
   578  if __name__ == '__main__':
   579    logging.getLogger().setLevel(logging.INFO)
   580    unittest.main()