# Origin: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filesystemio_test.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for filesystemio."""

# pytype: skip-file

import io
import logging
import multiprocessing
import os
import threading
import unittest

from apache_beam.io import filesystemio

_LOGGER = logging.getLogger(__name__)


class FakeDownloader(filesystemio.Downloader):
  """In-memory Downloader that records the size of the last range request."""
  def __init__(self, data):
    self._data = data
    # Size (end - start) of the most recent get_range() call; -1 until the
    # first call is made.
    self.last_read_size = -1

  @property
  def size(self):
    """Returns the length of the backing data."""
    return len(self._data)

  def get_range(self, start, end):
    """Returns data[start:end] and remembers the requested range size."""
    self.last_read_size = end - start
    return self._data[start:end]


class FakeUploader(filesystemio.Uploader):
  """In-memory Uploader that accumulates everything passed to put()."""
  def __init__(self):
    self.data = b''
    # Length of the most recent put() payload; -1 until the first put().
    self.last_write_size = -1
    self.finished = False

  def last_error(self):
    """Always reports that no error occurred."""
    return None

  def put(self, data):
    """Appends data to the accumulated bytes.

    Args:
      data: object exposing tobytes() and len(); the tests pass memoryview
        objects.
    """
    # Writing after finish() indicates a bug in the stream under test.
    assert not self.finished
    self.data += data.tobytes()
    self.last_write_size = len(data)

  def finish(self):
    """Marks the upload as finished; later put() calls fail the assertion."""
    self.finished = True


class TestDownloaderStream(unittest.TestCase):
  """Tests for filesystemio.DownloaderStream backed by FakeDownloader."""
  def test_file_attributes(self):
    """The stream reports mode 'rb' and is readable, seekable, not writable."""
    downloader = FakeDownloader(data=None)
    stream = filesystemio.DownloaderStream(downloader)
    self.assertEqual(stream.mode, 'rb')
    self.assertTrue(stream.readable())
    self.assertFalse(stream.writable())
    self.assertTrue(stream.seekable())

  def test_read_empty(self):
    """Reading from an empty source returns empty bytes."""
    downloader = FakeDownloader(data=b'')
    stream = filesystemio.DownloaderStream(downloader)
    self.assertEqual(stream.read(), b'')

  def test_read(self):
    """Unbuffered reads request exactly the number of bytes asked for."""
    data = b'abcde'
    downloader = FakeDownloader(data)
    stream = filesystemio.DownloaderStream(downloader)

    # Read size is exactly what was passed to read() (unbuffered).
    self.assertEqual(stream.read(1), data[0:1])
    self.assertEqual(downloader.last_read_size, 1)
    self.assertEqual(stream.read(), data[1:])
    self.assertEqual(downloader.last_read_size, len(data) - 1)

  def test_read_buffered(self):
    """Wrapping in io.BufferedReader makes the downloader read ahead."""
    data = b'abcde'
    downloader = FakeDownloader(data)
    buffer_size = 2
    stream = io.BufferedReader(
        filesystemio.DownloaderStream(downloader), buffer_size)

    # Verify that buffering works and is reading ahead.
    self.assertEqual(stream.read(1), data[0:1])
    self.assertEqual(downloader.last_read_size, buffer_size)
    self.assertEqual(stream.read(), data[1:])


class TestUploaderStream(unittest.TestCase):
  """Tests for filesystemio.UploaderStream backed by FakeUploader."""
  def test_file_attributes(self):
    """The stream reports mode 'wb' and is writable only."""
    uploader = FakeUploader()
    stream = filesystemio.UploaderStream(uploader)
    self.assertEqual(stream.mode, 'wb')
    self.assertFalse(stream.readable())
    self.assertTrue(stream.writable())
    self.assertFalse(stream.seekable())

  def test_write_empty(self):
    """Writing an empty buffer leaves the uploader's data empty."""
    uploader = FakeUploader()
    stream = filesystemio.UploaderStream(uploader)
    data = b''
    stream.write(memoryview(data))
    self.assertEqual(uploader.data, data)

  def test_write(self):
    """Unbuffered writes reach the uploader one call at a time."""
    data = b'abcde'
    uploader = FakeUploader()
    stream = filesystemio.UploaderStream(uploader)

    # Unbuffered writes.
    stream.write(memoryview(data[0:1]))
    self.assertEqual(uploader.data[0], data[0])
    self.assertEqual(uploader.last_write_size, 1)
    stream.write(memoryview(data[1:]))
    self.assertEqual(uploader.data, data)
    self.assertEqual(uploader.last_write_size, len(data) - 1)

  def test_write_buffered(self):
    """Wrapping in io.BufferedWriter defers writes until the buffer fills."""
    data = b'abcde'
    uploader = FakeUploader()
    buffer_size = 2
    stream = io.BufferedWriter(
        filesystemio.UploaderStream(uploader), buffer_size)

    # Verify that buffering works: doesn't write to uploader until buffer is
    # filled.
    stream.write(data[0:1])
    self.assertEqual(-1, uploader.last_write_size)
    stream.write(data[1:])
    stream.close()
    self.assertEqual(data, uploader.data)


class TestPipeStream(unittest.TestCase):
  """Tests for filesystemio.PipeStream fed through a multiprocessing.Pipe."""
  def _read_and_verify(self, stream, expected, buffer_size, success):
    """Reads the stream to exhaustion and checks content and tell().

    Intended to run as a thread target: on completion without a failed
    assertion it sets success[0] to True so the spawning test can detect
    whether this reader finished cleanly.

    Args:
      stream: the stream under test.
      expected: bytes the concatenated reads must equal.
      buffer_size: size passed to every stream.read() call.
      success: single-element list used as an out-parameter.
    """
    data_list = []
    bytes_read = 0
    seen_last_block = False
    while True:
      data = stream.read(buffer_size)
      self.assertLessEqual(len(data), buffer_size)
      if len(data) < buffer_size:
        # Test the constraint that the pipe stream returns less than the buffer
        # size only when at the end of the stream.
        if data:
          self.assertFalse(seen_last_block)
        seen_last_block = True
      if not data:
        break
      data_list.append(data)
      bytes_read += len(data)
      self.assertEqual(stream.tell(), bytes_read)
    self.assertEqual(b''.join(data_list), expected)
    success[0] = True

  def _read_and_seek(self, stream, expected, buffer_size, success):
    """Reads the stream while exercising seek() after every read.

    After each read it checks that seeking one byte past or one byte before
    the start of the block just read raises NotImplementedError, and that
    seeking to the start of that block replays the same data. Sets success[0]
    to True when it completes without a failed assertion.

    Args:
      stream: the stream under test.
      expected: bytes the concatenated reads must equal.
      buffer_size: size passed to every stream.read() call.
      success: single-element list used as an out-parameter.
    """
    data_list = []
    bytes_read = 0
    while True:
      data = stream.read(buffer_size)

      # Test bad seek positions.
      with self.assertRaises(NotImplementedError):
        stream.seek(bytes_read + 1)
      with self.assertRaises(NotImplementedError):
        stream.seek(bytes_read - 1)

      # Rewind stream and test that it reads back the same data again.
      stream.seek(bytes_read)
      data2 = stream.read(buffer_size)
      self.assertEqual(data, data2)

      if not data:
        break
      data_list.append(data)
      bytes_read += len(data)
      self.assertEqual(stream.tell(), bytes_read)
    actual = b''.join(data_list)
    # Compare lengths first so a size mismatch is reported without dumping
    # the full payloads.
    self.assertEqual(len(actual), len(expected))
    self.assertEqual(actual, expected)
    success[0] = True

  def test_pipe_stream(self):
    """Streams random blocks through a pipe for each reader and buffer size.

    The reader runs in a separate thread while this thread writes the blocks.
    The reader signals completion through the shared success list, which is
    asserted after join().
    """
    # Block sizes are powers of four: 1 byte up to 4**11 bytes.
    block_sizes = [4**i for i in range(12)]
    data_blocks = [os.urandom(size) for size in block_sizes]
    expected = b''.join(data_blocks)

    buffer_sizes = [100001, 512 * 1024, 1024 * 1024]

    for buffer_size in buffer_sizes:
      for target in [self._read_and_verify, self._read_and_seek]:
        # Lazy %-args: the message is only formatted if INFO is enabled.
        _LOGGER.info('buffer_size=%s, target=%s', buffer_size, target)
        parent_conn, child_conn = multiprocessing.Pipe()
        stream = filesystemio.PipeStream(child_conn)
        success = [False]
        child_thread = threading.Thread(
            target=target, args=(stream, expected, buffer_size, success))
        child_thread.start()
        for data in data_blocks:
          parent_conn.send_bytes(data)
        # Closing the write end lets the reader observe end of stream.
        parent_conn.close()
        child_thread.join()
        self.assertTrue(success[0], 'error in test thread')

  def test_pipe_stream_rewind_buffer(self):
    """Seeking back to the start of the last read replays the same data."""
    buffer_size = 512
    data = os.urandom(buffer_size)

    parent_conn, child_conn = multiprocessing.Pipe()
    parent_conn.send_bytes(data)
    parent_conn.close()
    stream = filesystemio.PipeStream(child_conn)

    # Regular read.
    read_data = stream.read(buffer_size)
    self.assertEqual(data, read_data)

    # Rewind buffer_size bytes.
    stream.seek(0)
    read_data = stream.read(buffer_size)
    self.assertEqual(data, read_data)

    # Read 0 bytes. Rewind buffer still points to offset 0.
    read_data = stream.read(buffer_size)
    self.assertFalse(read_data)
    stream.seek(0)
    read_data = stream.read(buffer_size)
    self.assertEqual(data, read_data)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()