github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filebasedsink_test.py (about)

     1  # -*- coding: utf-8 -*-
     2  #
     3  # Licensed to the Apache Software Foundation (ASF) under one or more
     4  # contributor license agreements.  See the NOTICE file distributed with
     5  # this work for additional information regarding copyright ownership.
     6  # The ASF licenses this file to You under the Apache License, Version 2.0
     7  # (the "License"); you may not use this file except in compliance with
     8  # the License.  You may obtain a copy of the License at
     9  #
    10  #    http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  # Unless required by applicable law or agreed to in writing, software
    13  # distributed under the License is distributed on an "AS IS" BASIS,
    14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  # See the License for the specific language governing permissions and
    16  # limitations under the License.
    17  #
    18  
    19  """Unit tests for file sinks."""
    20  
    21  # pytype: skip-file
    22  
    23  import glob
    24  import logging
    25  import os
    26  import shutil
    27  import tempfile
    28  import unittest
    29  
    30  import hamcrest as hc
    31  import mock
    32  
    33  import apache_beam as beam
    34  from apache_beam.coders import coders
    35  from apache_beam.io import filebasedsink
    36  from apache_beam.io.filesystem import BeamIOError
    37  from apache_beam.options.value_provider import StaticValueProvider
    38  from apache_beam.testing.test_pipeline import TestPipeline
    39  from apache_beam.transforms.display import DisplayData
    40  from apache_beam.transforms.display_test import DisplayDataItemMatcher
    41  
# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)
    43  
    44  
    45  # TODO: Refactor code so all io tests are using same library
    46  # TestCaseWithTempDirCleanup class.
    47  class _TestCaseWithTempDirCleanUp(unittest.TestCase):
    48    """Base class for TestCases that deals with TempDir clean-up.
    49  
    50    Inherited test cases will call self._new_tempdir() to start a temporary dir
    51    which will be deleted at the end of the tests (when tearDown() is called).
    52    """
    53    def setUp(self):
    54      self._tempdirs = []
    55  
    56    def tearDown(self):
    57      for path in self._tempdirs:
    58        if os.path.exists(path):
    59          shutil.rmtree(path)
    60      self._tempdirs = []
    61  
    62    def _new_tempdir(self):
    63      result = tempfile.mkdtemp()
    64      self._tempdirs.append(result)
    65      return result
    66  
    67    def _create_temp_file(self, name='', suffix='', dir=None, content=None):
    68      if not name:
    69        name = tempfile.template
    70      if not dir:
    71        dir = self._new_tempdir()
    72      file_name = tempfile.NamedTemporaryFile(
    73          delete=False, prefix=name, dir=dir, suffix=suffix).name
    74  
    75      if content:
    76        with open(file_name, 'w') as f:
    77          f.write(content)
    78      return file_name
    79  
    80  
    81  class MyFileBasedSink(filebasedsink.FileBasedSink):
    82    def open(self, temp_path):
    83      # TODO: Fix main session pickling.
    84      # file_handle = super().open(temp_path)
    85      file_handle = filebasedsink.FileBasedSink.open(self, temp_path)
    86      file_handle.write(b'[start]')
    87      return file_handle
    88  
    89    def write_encoded_record(self, file_handle, encoded_value):
    90      file_handle.write(b'[')
    91      file_handle.write(encoded_value)
    92      file_handle.write(b']')
    93  
    94    def close(self, file_handle):
    95      file_handle.write(b'[end]')
    96      # TODO: Fix main session pickling.
    97      # file_handle = super().close(file_handle)
    98      file_handle = filebasedsink.FileBasedSink.close(self, file_handle)
    99  
   100  
   101  class TestFileBasedSink(_TestCaseWithTempDirCleanUp):
   102    def _common_init(self, sink):
   103      # Manually invoke the generic Sink API.
   104      init_token = sink.initialize_write()
   105  
   106      writer1 = sink.open_writer(init_token, '1')
   107      writer1.write('a')
   108      writer1.write('b')
   109      res1 = writer1.close()
   110  
   111      writer2 = sink.open_writer(init_token, '2')
   112      writer2.write('x')
   113      writer2.write('y')
   114      writer2.write('z')
   115      res2 = writer2.close()
   116  
   117      return init_token, [res1, res2]
   118  
   119    def test_file_sink_writing(self):
   120      temp_path = os.path.join(self._new_tempdir(), 'FileBasedSink')
   121      sink = MyFileBasedSink(
   122          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   123  
   124      init_token, writer_results = self._common_init(sink)
   125  
   126      pre_finalize_results = sink.pre_finalize(init_token, writer_results)
   127      finalize_res1 = list(
   128          sink.finalize_write(init_token, writer_results, pre_finalize_results))
   129      # Retry the finalize operation (as if the first attempt was lost).
   130      finalize_res2 = list(
   131          sink.finalize_write(init_token, writer_results, pre_finalize_results))
   132  
   133      # Check the results.
   134      shard1 = temp_path + '-00000-of-00002.output'
   135      shard2 = temp_path + '-00001-of-00002.output'
   136      self.assertEqual(finalize_res1, [shard1, shard2])
   137      self.assertEqual(finalize_res2, [])
   138      self.assertEqual(open(shard1).read(), '[start][a][b][end]')
   139      self.assertEqual(open(shard2).read(), '[start][x][y][z][end]')
   140  
   141      # Check that any temp files are deleted.
   142      self.assertCountEqual([shard1, shard2], glob.glob(temp_path + '*'))
   143  
   144    def test_file_sink_display_data(self):
   145      temp_path = os.path.join(self._new_tempdir(), 'display')
   146      sink = MyFileBasedSink(
   147          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   148      dd = DisplayData.create_from(sink)
   149      expected_items = [
   150          DisplayDataItemMatcher('compression', 'auto'),
   151          DisplayDataItemMatcher(
   152              'file_pattern',
   153              '{}{}'.format(
   154                  temp_path, '-%(shard_num)05d-of-%(num_shards)05d.output'))
   155      ]
   156      hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
   157  
   158    def test_empty_write(self):
   159      temp_path = tempfile.NamedTemporaryFile().name
   160      sink = MyFileBasedSink(
   161          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   162      with TestPipeline() as p:
   163        p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
   164      self.assertEqual(
   165          open(temp_path + '-00000-of-00001.output').read(), '[start][end]')
   166  
   167    def test_static_value_provider_empty_write(self):
   168      temp_path = StaticValueProvider(
   169          value_type=str, value=tempfile.NamedTemporaryFile().name)
   170      sink = MyFileBasedSink(
   171          temp_path,
   172          file_name_suffix=StaticValueProvider(value_type=str, value='.output'),
   173          coder=coders.ToBytesCoder())
   174      with TestPipeline() as p:
   175        p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
   176      self.assertEqual(
   177          open(temp_path.get() + '-00000-of-00001.output').read(), '[start][end]')
   178  
   179    def test_fixed_shard_write(self):
   180      temp_path = os.path.join(self._new_tempdir(), 'empty')
   181      sink = MyFileBasedSink(
   182          temp_path,
   183          file_name_suffix='.output',
   184          num_shards=3,
   185          shard_name_template='_NN_SSS_',
   186          coder=coders.ToBytesCoder())
   187      with TestPipeline() as p:
   188        p | beam.Create(['a', 'b']) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
   189  
   190      concat = ''.join(
   191          open(temp_path + '_03_%03d_.output' % shard_num).read()
   192          for shard_num in range(3))
   193      self.assertTrue('][a][' in concat, concat)
   194      self.assertTrue('][b][' in concat, concat)
   195  
   196    # Not using 'test' in name so that 'nose' doesn't pick this as a test.
   197    def run_temp_dir_check(
   198        self,
   199        no_dir_path,
   200        dir_path,
   201        no_dir_root_path,
   202        dir_root_path,
   203        prefix,
   204        separator):
   205      def _get_temp_dir(file_path_prefix):
   206        sink = MyFileBasedSink(
   207            file_path_prefix,
   208            file_name_suffix='.output',
   209            coder=coders.ToBytesCoder())
   210        return sink.initialize_write()
   211  
   212      temp_dir = _get_temp_dir(no_dir_path)
   213      self.assertTrue(temp_dir.startswith(prefix))
   214      last_sep = temp_dir.rfind(separator)
   215      self.assertTrue(temp_dir[last_sep + 1:].startswith('beam-temp'))
   216  
   217      temp_dir = _get_temp_dir(dir_path)
   218      self.assertTrue(temp_dir.startswith(prefix))
   219      last_sep = temp_dir.rfind(separator)
   220      self.assertTrue(temp_dir[last_sep + 1:].startswith('beam-temp'))
   221  
   222      with self.assertRaises(ValueError):
   223        _get_temp_dir(no_dir_root_path)
   224  
   225      with self.assertRaises(ValueError):
   226        _get_temp_dir(dir_root_path)
   227  
   228    def test_temp_dir_uniqueness(self):
   229      temp_path = os.path.join(self._new_tempdir(), 'unique')
   230      sink = MyFileBasedSink(temp_path, coder=coders.ToBytesCoder())
   231      init_list = [''] * 1000
   232      temp_dir_list = [sink._create_temp_dir(temp_path) for _ in init_list]
   233      temp_dir_set = set(temp_dir_list)
   234      self.assertEqual(len(temp_dir_list), len(temp_dir_set))
   235  
   236    def test_temp_dir_gcs(self):
   237      try:
   238        self.run_temp_dir_check(
   239            'gs://aaa/bbb',
   240            'gs://aaa/bbb/',
   241            'gs://aaa',
   242            'gs://aaa/',
   243            'gs://',
   244            '/')
   245      except ValueError:
   246        _LOGGER.debug('Ignoring test since GCP module is not installed')
   247  
   248    @mock.patch('apache_beam.io.localfilesystem.os')
   249    def test_temp_dir_local(self, filesystem_os_mock):
   250      # Here we test a unix-like mock file-system
   251      # (not really testing Unix or Windows since we mock the function of 'os'
   252      # module).
   253  
   254      def _fake_unix_split(path):
   255        sep = path.rfind('/')
   256        if sep < 0:
   257          raise ValueError('Path must contain a separator')
   258        return (path[:sep], path[sep + 1:])
   259  
   260      def _fake_unix_join(base, path):
   261        return base + '/' + path
   262  
   263      filesystem_os_mock.path.abspath = lambda a: a
   264      filesystem_os_mock.path.split.side_effect = _fake_unix_split
   265      filesystem_os_mock.path.join.side_effect = _fake_unix_join
   266      self.run_temp_dir_check('/aaa/bbb', '/aaa/bbb/', '/', '/', '/', '/')
   267  
   268    def test_file_sink_multi_shards(self):
   269      temp_path = os.path.join(self._new_tempdir(), 'multishard')
   270      sink = MyFileBasedSink(
   271          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   272  
   273      # Manually invoke the generic Sink API.
   274      init_token = sink.initialize_write()
   275  
   276      num_shards = 1000
   277      writer_results = []
   278      for i in range(num_shards):
   279        uuid = 'uuid-%05d' % i
   280        writer = sink.open_writer(init_token, uuid)
   281        writer.write('a')
   282        writer.write('b')
   283        writer.write(uuid)
   284        writer_results.append(writer.close())
   285  
   286      pre_finalize_results = sink.pre_finalize(init_token, writer_results)
   287      res = sorted(
   288          sink.finalize_write(init_token, writer_results, pre_finalize_results))
   289  
   290      for i in range(num_shards):
   291        shard_name = '%s-%05d-of-%05d.output' % (temp_path, i, num_shards)
   292        uuid = 'uuid-%05d' % i
   293        self.assertEqual(res[i], shard_name)
   294        self.assertEqual(
   295            open(shard_name).read(), ('[start][a][b][%s][end]' % uuid))
   296  
   297      # Check that any temp files are deleted.
   298      self.assertCountEqual(res, glob.glob(temp_path + '*'))
   299  
   300    @mock.patch.object(filebasedsink.FileSystems, 'rename')
   301    def test_file_sink_rename_error(self, rename_mock):
   302      temp_path = os.path.join(self._new_tempdir(), 'rename_error')
   303      sink = MyFileBasedSink(
   304          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   305      init_token, writer_results = self._common_init(sink)
   306      pre_finalize_results = sink.pre_finalize(init_token, writer_results)
   307  
   308      error_str = 'mock rename error description'
   309      rename_mock.side_effect = BeamIOError(
   310          'mock rename error', {('src', 'dst'): error_str})
   311      with self.assertRaisesRegex(Exception, error_str):
   312        list(
   313            sink.finalize_write(init_token, writer_results, pre_finalize_results))
   314  
   315    def test_file_sink_src_missing(self):
   316      temp_path = os.path.join(self._new_tempdir(), 'src_missing')
   317      sink = MyFileBasedSink(
   318          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   319      init_token, writer_results = self._common_init(sink)
   320      pre_finalize_results = sink.pre_finalize(init_token, writer_results)
   321  
   322      os.remove(writer_results[0])
   323      with self.assertRaisesRegex(Exception, r'not exist'):
   324        list(
   325            sink.finalize_write(init_token, writer_results, pre_finalize_results))
   326  
   327    def test_file_sink_dst_matches_src(self):
   328      temp_path = os.path.join(self._new_tempdir(), 'dst_matches_src')
   329      sink = MyFileBasedSink(
   330          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   331      init_token, [res1, res2] = self._common_init(sink)
   332  
   333      pre_finalize_results = sink.pre_finalize(init_token, [res1, res2])
   334      list(sink.finalize_write(init_token, [res1, res2], pre_finalize_results))
   335  
   336      self.assertFalse(os.path.exists(res1))
   337      self.assertFalse(os.path.exists(res2))
   338      shard1 = temp_path + '-00000-of-00002.output'
   339      shard2 = temp_path + '-00001-of-00002.output'
   340      self.assertEqual(open(shard1).read(), '[start][a][b][end]')
   341      self.assertEqual(open(shard2).read(), '[start][x][y][z][end]')
   342  
   343      os.makedirs(os.path.dirname(res1))
   344      shutil.copyfile(shard1, res1)
   345      shutil.copyfile(shard2, res2)
   346      list(sink.finalize_write(init_token, [res1, res2], pre_finalize_results))
   347  
   348    def test_pre_finalize(self):
   349      temp_path = os.path.join(self._new_tempdir(), 'pre_finalize')
   350      sink = MyFileBasedSink(
   351          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   352      init_token, [res1, res2] = self._common_init(sink)
   353  
   354      # no-op
   355      sink.pre_finalize(init_token, [res1, res2])
   356  
   357      # Create finalized outputs from a previous run, which pre_finalize should
   358      # delete.
   359      shard1 = temp_path + '-00000-of-00002.output'
   360      shard2 = temp_path + '-00001-of-00002.output'
   361      with open(shard1, 'w') as f:
   362        f.write('foo')
   363      with open(shard2, 'w') as f:
   364        f.write('foo')
   365      self.assertTrue(os.path.exists(res1))
   366      self.assertTrue(os.path.exists(res2))
   367      self.assertTrue(os.path.exists(shard1))
   368      self.assertTrue(os.path.exists(shard2))
   369  
   370      sink.pre_finalize(init_token, [res1, res2])
   371      self.assertTrue(os.path.exists(res1))
   372      self.assertTrue(os.path.exists(res2))
   373      self.assertFalse(os.path.exists(shard1))
   374      self.assertFalse(os.path.exists(shard2))
   375  
   376    @mock.patch.object(filebasedsink.FileSystems, 'delete')
   377    def test_pre_finalize_error(self, delete_mock):
   378      temp_path = os.path.join(self._new_tempdir(), 'pre_finalize')
   379      sink = MyFileBasedSink(
   380          temp_path, file_name_suffix='.output', coder=coders.ToBytesCoder())
   381      init_token, [res1, res2] = self._common_init(sink)
   382  
   383      # no-op
   384      sink.pre_finalize(init_token, [res1, res2])
   385  
   386      # Create finalized outputs from a previous run, which pre_finalize should
   387      # delete.
   388      shard1 = temp_path + '-00000-of-00002.output'
   389      shard2 = temp_path + '-00001-of-00002.output'
   390      with open(shard1, 'w') as f:
   391        f.write('foo')
   392      with open(shard2, 'w') as f:
   393        f.write('foo')
   394  
   395      error_str = 'mock rename error description'
   396      delete_mock.side_effect = BeamIOError(
   397          'mock rename error', {shard2: error_str})
   398      with self.assertRaisesRegex(Exception, error_str):
   399        sink.pre_finalize(init_token, [res1, res2])
   400  
   401  
   402  if __name__ == '__main__':
   403    logging.getLogger().setLevel(logging.INFO)
   404    unittest.main()