github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filesystemio_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Tests for filesystemio."""
    19  
    20  # pytype: skip-file
    21  
    22  import io
    23  import logging
    24  import multiprocessing
    25  import os
    26  import threading
    27  import unittest
    28  
    29  from apache_beam.io import filesystemio
    30  
    31  _LOGGER = logging.getLogger(__name__)
    32  
    33  
    34  class FakeDownloader(filesystemio.Downloader):
    35    def __init__(self, data):
    36      self._data = data
    37      self.last_read_size = -1
    38  
    39    @property
    40    def size(self):
    41      return len(self._data)
    42  
    43    def get_range(self, start, end):
    44      self.last_read_size = end - start
    45      return self._data[start:end]
    46  
    47  
    48  class FakeUploader(filesystemio.Uploader):
    49    def __init__(self):
    50      self.data = b''
    51      self.last_write_size = -1
    52      self.finished = False
    53  
    54    def last_error(self):
    55      return None
    56  
    57    def put(self, data):
    58      assert not self.finished
    59      self.data += data.tobytes()
    60      self.last_write_size = len(data)
    61  
    62    def finish(self):
    63      self.finished = True
    64  
    65  
    66  class TestDownloaderStream(unittest.TestCase):
    67    def test_file_attributes(self):
    68      downloader = FakeDownloader(data=None)
    69      stream = filesystemio.DownloaderStream(downloader)
    70      self.assertEqual(stream.mode, 'rb')
    71      self.assertTrue(stream.readable())
    72      self.assertFalse(stream.writable())
    73      self.assertTrue(stream.seekable())
    74  
    75    def test_read_empty(self):
    76      downloader = FakeDownloader(data=b'')
    77      stream = filesystemio.DownloaderStream(downloader)
    78      self.assertEqual(stream.read(), b'')
    79  
    80    def test_read(self):
    81      data = b'abcde'
    82      downloader = FakeDownloader(data)
    83      stream = filesystemio.DownloaderStream(downloader)
    84  
    85      # Read size is exactly what was passed to read() (unbuffered).
    86      self.assertEqual(stream.read(1), data[0:1])
    87      self.assertEqual(downloader.last_read_size, 1)
    88      self.assertEqual(stream.read(), data[1:])
    89      self.assertEqual(downloader.last_read_size, len(data) - 1)
    90  
    91    def test_read_buffered(self):
    92      data = b'abcde'
    93      downloader = FakeDownloader(data)
    94      buffer_size = 2
    95      stream = io.BufferedReader(
    96          filesystemio.DownloaderStream(downloader), buffer_size)
    97  
    98      # Verify that buffering works and is reading ahead.
    99      self.assertEqual(stream.read(1), data[0:1])
   100      self.assertEqual(downloader.last_read_size, buffer_size)
   101      self.assertEqual(stream.read(), data[1:])
   102  
   103  
   104  class TestUploaderStream(unittest.TestCase):
   105    def test_file_attributes(self):
   106      uploader = FakeUploader()
   107      stream = filesystemio.UploaderStream(uploader)
   108      self.assertEqual(stream.mode, 'wb')
   109      self.assertFalse(stream.readable())
   110      self.assertTrue(stream.writable())
   111      self.assertFalse(stream.seekable())
   112  
   113    def test_write_empty(self):
   114      uploader = FakeUploader()
   115      stream = filesystemio.UploaderStream(uploader)
   116      data = b''
   117      stream.write(memoryview(data))
   118      self.assertEqual(uploader.data, data)
   119  
   120    def test_write(self):
   121      data = b'abcde'
   122      uploader = FakeUploader()
   123      stream = filesystemio.UploaderStream(uploader)
   124  
   125      # Unbuffered writes.
   126      stream.write(memoryview(data[0:1]))
   127      self.assertEqual(uploader.data[0], data[0])
   128      self.assertEqual(uploader.last_write_size, 1)
   129      stream.write(memoryview(data[1:]))
   130      self.assertEqual(uploader.data, data)
   131      self.assertEqual(uploader.last_write_size, len(data) - 1)
   132  
   133    def test_write_buffered(self):
   134      data = b'abcde'
   135      uploader = FakeUploader()
   136      buffer_size = 2
   137      stream = io.BufferedWriter(
   138          filesystemio.UploaderStream(uploader), buffer_size)
   139  
   140      # Verify that buffering works: doesn't write to uploader until buffer is
   141      # filled.
   142      stream.write(data[0:1])
   143      self.assertEqual(-1, uploader.last_write_size)
   144      stream.write(data[1:])
   145      stream.close()
   146      self.assertEqual(data, uploader.data)
   147  
   148  
   149  class TestPipeStream(unittest.TestCase):
   150    def _read_and_verify(self, stream, expected, buffer_size, success):
   151      data_list = []
   152      bytes_read = 0
   153      seen_last_block = False
   154      while True:
   155        data = stream.read(buffer_size)
   156        self.assertLessEqual(len(data), buffer_size)
   157        if len(data) < buffer_size:
   158          # Test the constraint that the pipe stream returns less than the buffer
   159          # size only when at the end of the stream.
   160          if data:
   161            self.assertFalse(seen_last_block)
   162          seen_last_block = True
   163        if not data:
   164          break
   165        data_list.append(data)
   166        bytes_read += len(data)
   167        self.assertEqual(stream.tell(), bytes_read)
   168      self.assertEqual(b''.join(data_list), expected)
   169      success[0] = True
   170  
   171    def _read_and_seek(self, stream, expected, buffer_size, success):
   172      data_list = []
   173      bytes_read = 0
   174      while True:
   175        data = stream.read(buffer_size)
   176  
   177        # Test bad seek positions.
   178        with self.assertRaises(NotImplementedError):
   179          stream.seek(bytes_read + 1)
   180        with self.assertRaises(NotImplementedError):
   181          stream.seek(bytes_read - 1)
   182  
   183        # Rewind stream and test that it reads back the same data again.
   184        stream.seek(bytes_read)
   185        data2 = stream.read(buffer_size)
   186        self.assertEqual(data, data2)
   187  
   188        if not data:
   189          break
   190        data_list.append(data)
   191        bytes_read += len(data)
   192        self.assertEqual(stream.tell(), bytes_read)
   193      self.assertEqual(len(b''.join(data_list)), len(expected))
   194      self.assertEqual(b''.join(data_list), expected)
   195      success[0] = True
   196  
   197    def test_pipe_stream(self):
   198      block_sizes = list(4**i for i in range(0, 12))
   199      data_blocks = list(os.urandom(size) for size in block_sizes)
   200      expected = b''.join(data_blocks)
   201  
   202      buffer_sizes = [100001, 512 * 1024, 1024 * 1024]
   203  
   204      for buffer_size in buffer_sizes:
   205        for target in [self._read_and_verify, self._read_and_seek]:
   206          _LOGGER.info('buffer_size=%s, target=%s' % (buffer_size, target))
   207          parent_conn, child_conn = multiprocessing.Pipe()
   208          stream = filesystemio.PipeStream(child_conn)
   209          success = [False]
   210          child_thread = threading.Thread(
   211              target=target, args=(stream, expected, buffer_size, success))
   212          child_thread.start()
   213          for data in data_blocks:
   214            parent_conn.send_bytes(data)
   215          parent_conn.close()
   216          child_thread.join()
   217          self.assertTrue(success[0], 'error in test thread')
   218  
   219    def test_pipe_stream_rewind_buffer(self):
   220      buffer_size = 512
   221      data = os.urandom(buffer_size)
   222  
   223      parent_conn, child_conn = multiprocessing.Pipe()
   224      parent_conn.send_bytes(data)
   225      parent_conn.close()
   226      stream = filesystemio.PipeStream(child_conn)
   227  
   228      # Regular read.
   229      read_data = stream.read(buffer_size)
   230      self.assertEqual(data, read_data)
   231  
   232      # Rewind buffer_size bytes.
   233      stream.seek(0)
   234      read_data = stream.read(buffer_size)
   235      self.assertEqual(data, read_data)
   236  
   237      # Read 0 bytes. Rewind buffer still points to offset 0.
   238      read_data = stream.read(buffer_size)
   239      self.assertFalse(read_data)
   240      stream.seek(0)
   241      read_data = stream.read(buffer_size)
   242      self.assertEqual(data, read_data)
   243  
   244  
if __name__ == '__main__':
  # Raise the root logger to INFO so per-iteration messages from
  # test_pipe_stream are visible when the module is run directly.
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()