#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Utilities for ``FileSystem`` implementations."""

# pytype: skip-file

import abc
import io
import os

__all__ = [
    'Downloader',
    'Uploader',
    'DownloaderStream',
    'UploaderStream',
    'PipeStream'
]


class Downloader(metaclass=abc.ABCMeta):
  """Download interface for a single file.

  Implementations should support random access reads.
  """
  @property
  @abc.abstractmethod
  def size(self):
    """Size of file to download."""

  @abc.abstractmethod
  def get_range(self, start, end):
    """Retrieve a given byte range [start, end) from this download.

    Range must be in this form:
      0 <= start < end: Fetch the bytes from start to end.

    Args:
      start: (int) Initial byte offset.
      end: (int) Final byte offset, exclusive.

    Returns:
      (bytes) A buffer containing the requested data.
    """


class Uploader(metaclass=abc.ABCMeta):
  """Upload interface for a single file."""
  @abc.abstractmethod
  def put(self, data):
    """Write data to file sequentially.

    Args:
      data: (memoryview) Data to write.
    """

  @abc.abstractmethod
  def finish(self):
    """Signal to upload any remaining data and close the file.

    File should be fully written upon return from this method.

    Raises:
      Any error encountered during the upload.
    """


class DownloaderStream(io.RawIOBase):
  """Provides a stream interface for Downloader objects."""
  def __init__(
      self, downloader, read_buffer_size=io.DEFAULT_BUFFER_SIZE, mode='rb'):
    """Initializes the stream.

    Args:
      downloader: (Downloader) Filesystem dependent implementation.
      read_buffer_size: (int) Buffer size to use during read operations.
      mode: (string) Python mode attribute for this stream.
    """
    self._downloader = downloader
    self.mode = mode
    self._position = 0
    self._reader_buffer_size = read_buffer_size

  def readinto(self, b):
    """Read up to len(b) bytes into b.

    Returns number of bytes read (0 for EOF or for an empty buffer).

    Args:
      b: (bytearray/memoryview) Buffer to read into.

    Raises:
      ``ValueError``: When this stream is closed.
    """
    self._checkClosed()
    # An empty buffer can never receive data. Returning early also avoids
    # asking the downloader for the empty range [start, start), which its
    # contract (0 <= start < end) does not allow.
    if len(b) == 0 or self._position >= self._downloader.size:
      return 0

    start = self._position
    # Never request bytes past the end of the file.
    end = min(self._position + len(b), self._downloader.size)
    data = self._downloader.get_range(start, end)
    self._position += len(data)
    b[:len(data)] = data
    return len(data)

  def seek(self, offset, whence=os.SEEK_SET):
    """Set the stream's current offset.

    Note if the new offset is out of bound, it is adjusted to either 0 or EOF.

    Args:
      offset: seek offset as number.
      whence: seek mode. Supported modes are os.SEEK_SET (absolute seek),
        os.SEEK_CUR (seek relative to the current position), and os.SEEK_END
        (seek relative to the end, offset should be negative).

    Returns:
      The new absolute position after clamping.

    Raises:
      ``ValueError``: When this stream is closed or if whence is invalid.
    """
    self._checkClosed()

    if whence == os.SEEK_SET:
      self._position = offset
    elif whence == os.SEEK_CUR:
      self._position += offset
    elif whence == os.SEEK_END:
      self._position = self._downloader.size + offset
    else:
      raise ValueError('Whence mode %r is invalid.' % whence)

    # Clamp the resulting position into [0, size].
    self._position = min(self._position, self._downloader.size)
    self._position = max(self._position, 0)
    return self._position

  def tell(self):
    """Tell the stream's current offset.

    Returns:
      current offset in reading this stream.

    Raises:
      ``ValueError``: When this stream is closed.
    """
    self._checkClosed()
    return self._position

  def seekable(self):
    """Report that this stream supports seek()."""
    return True

  def readable(self):
    """Report that this stream supports reading."""
    return True

  def readall(self):
    """Read until EOF, using multiple read() calls.

    Each call requests ``read_buffer_size`` bytes, as given to the
    constructor.

    Returns:
      (bytes) All data from the current position to EOF.
    """
    res = []
    while True:
      data = self.read(self._reader_buffer_size)
      if not data:
        break
      res.append(data)
    return b''.join(res)


class UploaderStream(io.RawIOBase):
  """Provides a stream interface for Uploader objects."""
  def __init__(self, uploader, mode='wb'):
    """Initializes the stream.

    Args:
      uploader: (Uploader) Filesystem dependent implementation.
      mode: (string) Python mode attribute for this stream.
    """
    self._uploader = uploader
    self.mode = mode
    self._position = 0

  def tell(self):
    """Return the total number of bytes written through this stream."""
    return self._position

  def write(self, b):
    """Write bytes from b.

    Returns number of bytes written (always len(b); the whole buffer is
    handed to the uploader).

    Args:
      b: (memoryview) Buffer with data to write.

    Raises:
      ``ValueError``: When this stream is closed.
    """
    self._checkClosed()
    self._uploader.put(b)

    bytes_written = len(b)
    self._position += bytes_written
    return bytes_written

  def close(self):
    """Complete the upload and close this stream.

    This method has no effect if the stream is already closed.

    The stream is marked closed even when the uploader fails, so a later
    close() (including the implicit one at garbage collection) does not call
    ``finish()`` a second time.

    Raises:
      Any error encountered by the uploader.
    """
    try:
      if not self.closed:
        self._uploader.finish()
    finally:
      super().close()

  def writable(self):
    """Report that this stream supports writing."""
    return True


class PipeStream(object):
  """A class that presents a pipe connection as a readable stream.

  Not thread-safe.

  Remembers the last ``size`` bytes read and allows rewinding the stream by that
  amount exactly. See BEAM-6380 for more.
  """
  def __init__(self, recv_pipe):
    """Initializes the stream.

    Args:
      recv_pipe: Connection-like object; only its ``recv_bytes()`` method is
        used, and it is expected to raise ``EOFError`` at end of stream.
    """
    self.conn = recv_pipe
    self.closed = False
    self.position = 0
    # Bytes already received from the pipe but not yet handed to a reader.
    self.remaining = b''

    # Data and position of last block streamed. Allows limited seeking backwards
    # of stream.
    self.last_block_position = None
    self.last_block = b''

  def read(self, size):
    """Read data from the wrapped pipe connection.

    Args:
      size: Number of bytes to read. Actual number of bytes read is always
        equal to size unless EOF is reached.

    Returns:
      data read as bytes.
    """
    data_list = []
    bytes_read = 0
    last_block_position = self.position

    while bytes_read < size:
      bytes_from_remaining = min(size - bytes_read, len(self.remaining))
      data_list.append(self.remaining[0:bytes_from_remaining])
      self.remaining = self.remaining[bytes_from_remaining:]
      self.position += bytes_from_remaining
      bytes_read += bytes_from_remaining
      # Refill as soon as the local buffer is drained, even if this request
      # is already satisfied.
      if not self.remaining:
        try:
          self.remaining = self.conn.recv_bytes()
        except EOFError:
          break

    last_block = b''.join(data_list)
    # Only a non-empty read replaces the remembered block, so an empty read at
    # EOF does not discard the ability to rewind.
    if last_block:
      self.last_block_position = last_block_position
      self.last_block = last_block
    return last_block

  def tell(self):
    """Tell the file's current offset.

    Returns:
      current offset in reading this file.

    Raises:
      ``IOError``: When this stream is closed.
    """
    self._check_open()
    return self.position

  def seek(self, offset, whence=os.SEEK_SET):
    """Perform one of the few supported seeks.

    Supported requests are: seek to the end with offset 0 (a no-op), an
    absolute seek to the current position (a no-op), and an absolute seek to
    the start of the most recently read block (rewinds that block once).

    Args:
      offset: seek offset as number.
      whence: seek mode (os.SEEK_SET or os.SEEK_END).

    Raises:
      ``NotImplementedError``: For any other offset/whence combination.
    """
    # The apitools library used by the gcsio.Uploader class insists on seeking
    # to the end of a stream to do a check before completing an upload, so we
    # must have this no-op method here in that case.
    if whence == os.SEEK_END and offset == 0:
      return
    elif whence == os.SEEK_SET:
      if offset == self.position:
        return
      elif offset == self.last_block_position and self.last_block:
        self.position = offset
        # Push the last block back in front of the unread data; clearing
        # last_block means the same block cannot be rewound twice.
        self.remaining = b''.join([self.last_block, self.remaining])
        self.last_block = b''
        return
    raise NotImplementedError(
        'offset: %s, whence: %s, position: %s, last: %s' %
        (offset, whence, self.position, self.last_block_position))

  def _check_open(self):
    """Raise ``IOError`` if this stream has been marked closed."""
    if self.closed:
      raise IOError('Stream is closed.')