github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/gcsfilesystem_test.py (about)

     1  # -*- coding: utf-8 -*-
     2  #
     3  # Licensed to the Apache Software Foundation (ASF) under one or more
     4  # contributor license agreements.  See the NOTICE file distributed with
     5  # this work for additional information regarding copyright ownership.
     6  # The ASF licenses this file to You under the Apache License, Version 2.0
     7  # (the "License"); you may not use this file except in compliance with
     8  # the License.  You may obtain a copy of the License at
     9  #
    10  #    http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  # Unless required by applicable law or agreed to in writing, software
    13  # distributed under the License is distributed on an "AS IS" BASIS,
    14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  # See the License for the specific language governing permissions and
    16  # limitations under the License.
    17  #
    18  
    19  """Unit tests for GCS File System."""
    20  
    21  # pytype: skip-file
    22  
    23  import logging
    24  import unittest
    25  
    26  import mock
    27  
    28  from apache_beam.io.filesystem import BeamIOError
    29  from apache_beam.io.filesystem import FileMetadata
    30  from apache_beam.options.pipeline_options import PipelineOptions
    31  
    32  # Protect against environments where apitools library is not available.
    33  # pylint: disable=wrong-import-order, wrong-import-position
    34  try:
    35    from apache_beam.io.gcp import gcsfilesystem
    36  except ImportError:
    37    gcsfilesystem = None  # type: ignore
    38  # pylint: enable=wrong-import-order, wrong-import-position
    39  
    40  
    41  @unittest.skipIf(gcsfilesystem is None, 'GCP dependencies are not installed')
    42  class GCSFileSystemTest(unittest.TestCase):
    43    def setUp(self):
    44      pipeline_options = PipelineOptions()
    45      self.fs = gcsfilesystem.GCSFileSystem(pipeline_options=pipeline_options)
    46  
    47    def test_scheme(self):
    48      self.assertEqual(self.fs.scheme(), 'gs')
    49      self.assertEqual(gcsfilesystem.GCSFileSystem.scheme(), 'gs')
    50  
    51    def test_join(self):
    52      self.assertEqual(
    53          'gs://bucket/path/to/file',
    54          self.fs.join('gs://bucket/path', 'to', 'file'))
    55      self.assertEqual(
    56          'gs://bucket/path/to/file', self.fs.join('gs://bucket/path', 'to/file'))
    57      self.assertEqual(
    58          'gs://bucket/path/to/file',
    59          self.fs.join('gs://bucket/path', '/to/file'))
    60      self.assertEqual(
    61          'gs://bucket/path/to/file',
    62          self.fs.join('gs://bucket/path/', 'to', 'file'))
    63      self.assertEqual(
    64          'gs://bucket/path/to/file',
    65          self.fs.join('gs://bucket/path/', 'to/file'))
    66      self.assertEqual(
    67          'gs://bucket/path/to/file',
    68          self.fs.join('gs://bucket/path/', '/to/file'))
    69      with self.assertRaises(ValueError):
    70        self.fs.join('/bucket/path/', '/to/file')
    71  
    72    def test_split(self):
    73      self.assertEqual(('gs://foo/bar', 'baz'), self.fs.split('gs://foo/bar/baz'))
    74      self.assertEqual(('gs://foo', ''), self.fs.split('gs://foo/'))
    75      self.assertEqual(('gs://foo', ''), self.fs.split('gs://foo'))
    76  
    77      with self.assertRaises(ValueError):
    78        self.fs.split('/no/gcs/prefix')
    79  
    80    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
    81    def test_match_multiples(self, mock_gcsio):
    82      # Prepare mocks.
    83      gcsio_mock = mock.MagicMock()
    84      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
    85      gcsio_mock.list_files.return_value = iter([
    86          ('gs://bucket/file1', (1, 99999.0)),
    87          ('gs://bucket/file2', (2, 88888.0))
    88      ])
    89      expected_results = set([
    90          FileMetadata('gs://bucket/file1', 1, 99999.0),
    91          FileMetadata('gs://bucket/file2', 2, 88888.0)
    92      ])
    93      match_result = self.fs.match(['gs://bucket/'])[0]
    94      self.assertEqual(set(match_result.metadata_list), expected_results)
    95      gcsio_mock.list_files.assert_called_once_with(
    96          'gs://bucket/', with_metadata=True)
    97  
    98    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
    99    def test_match_multiples_limit(self, mock_gcsio):
   100      # Prepare mocks.
   101      gcsio_mock = mock.MagicMock()
   102      limit = 1
   103      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   104      gcsio_mock.list_files.return_value = iter([
   105          ('gs://bucket/file1', (1, 99999.0))
   106      ])
   107      expected_results = set([FileMetadata('gs://bucket/file1', 1, 99999.0)])
   108      match_result = self.fs.match(['gs://bucket/'], [limit])[0]
   109      self.assertEqual(set(match_result.metadata_list), expected_results)
   110      self.assertEqual(len(match_result.metadata_list), limit)
   111      gcsio_mock.list_files.assert_called_once_with(
   112          'gs://bucket/', with_metadata=True)
   113  
   114    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   115    def test_match_multiples_error(self, mock_gcsio):
   116      # Prepare mocks.
   117      gcsio_mock = mock.MagicMock()
   118      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   119      exception = IOError('Failed')
   120      gcsio_mock.list_files.side_effect = exception
   121  
   122      with self.assertRaisesRegex(BeamIOError,
   123                                  r'^Match operation failed') as error:
   124        self.fs.match(['gs://bucket/'])
   125      self.assertRegex(
   126          str(error.exception.exception_details), r'gs://bucket/.*%s' % exception)
   127      gcsio_mock.list_files.assert_called_once_with(
   128          'gs://bucket/', with_metadata=True)
   129  
   130    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   131    def test_match_multiple_patterns(self, mock_gcsio):
   132      # Prepare mocks.
   133      gcsio_mock = mock.MagicMock()
   134      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   135      gcsio_mock.list_files.side_effect = [
   136          iter([('gs://bucket/file1', (1, 99999.0))]),
   137          iter([('gs://bucket/file2', (2, 88888.0))]),
   138      ]
   139      expected_results = [[FileMetadata('gs://bucket/file1', 1, 99999.0)],
   140                          [FileMetadata('gs://bucket/file2', 2, 88888.0)]]
   141      result = self.fs.match(['gs://bucket/file1*', 'gs://bucket/file2*'])
   142      self.assertEqual([mr.metadata_list for mr in result], expected_results)
   143  
   144    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   145    def test_create(self, mock_gcsio):
   146      # Prepare mocks.
   147      gcsio_mock = mock.MagicMock()
   148      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   149      # Issue file copy
   150      _ = self.fs.create('gs://bucket/from1', 'application/octet-stream')
   151  
   152      gcsio_mock.open.assert_called_once_with(
   153          'gs://bucket/from1', 'wb', mime_type='application/octet-stream')
   154  
   155    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   156    def test_open(self, mock_gcsio):
   157      # Prepare mocks.
   158      gcsio_mock = mock.MagicMock()
   159      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   160      # Issue file copy
   161      _ = self.fs.open('gs://bucket/from1', 'application/octet-stream')
   162  
   163      gcsio_mock.open.assert_called_once_with(
   164          'gs://bucket/from1', 'rb', mime_type='application/octet-stream')
   165  
   166    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   167    def test_copy_file(self, mock_gcsio):
   168      # Prepare mocks.
   169      gcsio_mock = mock.MagicMock()
   170      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   171      sources = ['gs://bucket/from1']
   172      destinations = ['gs://bucket/to1']
   173  
   174      # Issue file copy
   175      self.fs.copy(sources, destinations)
   176  
   177      gcsio_mock.copy.assert_called_once_with(
   178          'gs://bucket/from1', 'gs://bucket/to1')
   179  
   180    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   181    def test_copy_file_error(self, mock_gcsio):
   182      # Prepare mocks.
   183      gcsio_mock = mock.MagicMock()
   184      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   185      sources = ['gs://bucket/from1']
   186      destinations = ['gs://bucket/to1']
   187  
   188      exception = IOError('Failed')
   189      gcsio_mock.copy.side_effect = exception
   190  
   191      # Issue batch rename.
   192      expected_results = {
   193          (s, d): exception
   194          for s, d in zip(sources, destinations)
   195      }
   196  
   197      # Issue batch copy.
   198      with self.assertRaisesRegex(BeamIOError,
   199                                  r'^Copy operation failed') as error:
   200        self.fs.copy(sources, destinations)
   201      self.assertEqual(error.exception.exception_details, expected_results)
   202  
   203      gcsio_mock.copy.assert_called_once_with(
   204          'gs://bucket/from1', 'gs://bucket/to1')
   205  
   206    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   207    def test_copy_tree(self, mock_gcsio):
   208      # Prepare mocks.
   209      gcsio_mock = mock.MagicMock()
   210      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   211      sources = ['gs://bucket1/']
   212      destinations = ['gs://bucket2/']
   213  
   214      # Issue directory copy
   215      self.fs.copy(sources, destinations)
   216  
   217      gcsio_mock.copytree.assert_called_once_with(
   218          'gs://bucket1/', 'gs://bucket2/')
   219  
   220    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   221    def test_rename(self, mock_gcsio):
   222      # Prepare mocks.
   223      gcsio_mock = mock.MagicMock()
   224      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   225      sources = [
   226          'gs://bucket/from1',
   227          'gs://bucket/from2',
   228          'gs://bucket/from3',
   229      ]
   230      destinations = [
   231          'gs://bucket/to1',
   232          'gs://bucket/to2',
   233          'gs://bucket/to3',
   234      ]
   235      gcsio_mock.copy_batch.side_effect = [[
   236          ('gs://bucket/from1', 'gs://bucket/to1', None),
   237          ('gs://bucket/from2', 'gs://bucket/to2', None),
   238          ('gs://bucket/from3', 'gs://bucket/to3', None),
   239      ]]
   240      gcsio_mock.delete_batch.side_effect = [[
   241          ('gs://bucket/from1', None),
   242          ('gs://bucket/from2', None),
   243          ('gs://bucket/from3', None),
   244      ]]
   245  
   246      # Issue batch rename.
   247      self.fs.rename(sources, destinations)
   248  
   249      gcsio_mock.copy_batch.assert_called_once_with([
   250          ('gs://bucket/from1', 'gs://bucket/to1'),
   251          ('gs://bucket/from2', 'gs://bucket/to2'),
   252          ('gs://bucket/from3', 'gs://bucket/to3'),
   253      ])
   254      gcsio_mock.delete_batch.assert_called_once_with([
   255          'gs://bucket/from1',
   256          'gs://bucket/from2',
   257          'gs://bucket/from3',
   258      ])
   259  
   260    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   261    def test_rename_error(self, mock_gcsio):
   262      # Prepare mocks.
   263      gcsio_mock = mock.MagicMock()
   264      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   265      sources = [
   266          'gs://bucket/from1',
   267          'gs://bucket/from2',
   268          'gs://bucket/from3',
   269      ]
   270      destinations = [
   271          'gs://bucket/to1',
   272          'gs://bucket/to2',
   273          'gs://bucket/to3',
   274      ]
   275      exception = IOError('Failed')
   276      gcsio_mock.delete_batch.side_effect = [[(f, exception) for f in sources]]
   277      gcsio_mock.copy_batch.side_effect = [[
   278          ('gs://bucket/from1', 'gs://bucket/to1', None),
   279          ('gs://bucket/from2', 'gs://bucket/to2', None),
   280          ('gs://bucket/from3', 'gs://bucket/to3', None),
   281      ]]
   282  
   283      # Issue batch rename.
   284      expected_results = {
   285          (s, d): exception
   286          for s, d in zip(sources, destinations)
   287      }
   288  
   289      # Issue batch rename.
   290      with self.assertRaisesRegex(BeamIOError,
   291                                  r'^Rename operation failed') as error:
   292        self.fs.rename(sources, destinations)
   293      self.assertEqual(error.exception.exception_details, expected_results)
   294  
   295      gcsio_mock.copy_batch.assert_called_once_with([
   296          ('gs://bucket/from1', 'gs://bucket/to1'),
   297          ('gs://bucket/from2', 'gs://bucket/to2'),
   298          ('gs://bucket/from3', 'gs://bucket/to3'),
   299      ])
   300      gcsio_mock.delete_batch.assert_called_once_with([
   301          'gs://bucket/from1',
   302          'gs://bucket/from2',
   303          'gs://bucket/from3',
   304      ])
   305  
   306    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   307    def test_delete(self, mock_gcsio):
   308      # Prepare mocks.
   309      gcsio_mock = mock.MagicMock()
   310      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   311      gcsio_mock._status.return_value = {'size': 0, 'last_updated': 99999.0}
   312      files = [
   313          'gs://bucket/from1',
   314          'gs://bucket/from2',
   315          'gs://bucket/from3',
   316      ]
   317  
   318      # Issue batch delete.
   319      self.fs.delete(files)
   320      gcsio_mock.delete_batch.assert_called()
   321  
   322    @mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
   323    def test_delete_error(self, mock_gcsio):
   324      # Prepare mocks.
   325      gcsio_mock = mock.MagicMock()
   326      gcsfilesystem.gcsio.GcsIO = lambda pipeline_options=None: gcsio_mock
   327      exception = IOError('Failed')
   328      gcsio_mock.delete_batch.side_effect = exception
   329      gcsio_mock._status.return_value = {'size': 0, 'last_updated': 99999.0}
   330      files = [
   331          'gs://bucket/from1',
   332          'gs://bucket/from2',
   333          'gs://bucket/from3',
   334      ]
   335      expected_results = {f: exception for f in files}
   336  
   337      # Issue batch delete.
   338      with self.assertRaisesRegex(BeamIOError,
   339                                  r'^Delete operation failed') as error:
   340        self.fs.delete(files)
   341      self.assertEqual(error.exception.exception_details, expected_results)
   342      gcsio_mock.delete_batch.assert_called()
   343  
   344  
   345  if __name__ == '__main__':
   346    logging.getLogger().setLevel(logging.INFO)
   347    unittest.main()