github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/filesystemio.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Utilities for ``FileSystem`` implementations."""
    19  
    20  # pytype: skip-file
    21  
    22  import abc
    23  import io
    24  import os
    25  
# Public names of this module: this list is what ``from <module> import *``
# exports.
__all__ = [
    'Downloader',
    'Uploader',
    'DownloaderStream',
    'UploaderStream',
    'PipeStream'
]
    33  
    34  
    35  class Downloader(metaclass=abc.ABCMeta):
    36    """Download interface for a single file.
    37  
    38    Implementations should support random access reads.
    39    """
    40    @property
    41    @abc.abstractmethod
    42    def size(self):
    43      """Size of file to download."""
    44  
    45    @abc.abstractmethod
    46    def get_range(self, start, end):
    47      """Retrieve a given byte range [start, end) from this download.
    48  
    49      Range must be in this form:
    50        0 <= start < end: Fetch the bytes from start to end.
    51  
    52      Args:
    53        start: (int) Initial byte offset.
    54        end: (int) Final byte offset, exclusive.
    55  
    56      Returns:
    57        (string) A buffer containing the requested data.
    58      """
    59  
    60  
    61  class Uploader(metaclass=abc.ABCMeta):
    62    """Upload interface for a single file."""
    63    @abc.abstractmethod
    64    def put(self, data):
    65      """Write data to file sequentially.
    66  
    67      Args:
    68        data: (memoryview) Data to write.
    69      """
    70  
    71    @abc.abstractmethod
    72    def finish(self):
    73      """Signal to upload any remaining data and close the file.
    74  
    75      File should be fully written upon return from this method.
    76  
    77      Raises:
    78        Any error encountered during the upload.
    79      """
    80  
    81  
    82  class DownloaderStream(io.RawIOBase):
    83    """Provides a stream interface for Downloader objects."""
    84    def __init__(
    85        self, downloader, read_buffer_size=io.DEFAULT_BUFFER_SIZE, mode='rb'):
    86      """Initializes the stream.
    87  
    88      Args:
    89        downloader: (Downloader) Filesystem dependent implementation.
    90        read_buffer_size: (int) Buffer size to use during read operations.
    91        mode: (string) Python mode attribute for this stream.
    92      """
    93      self._downloader = downloader
    94      self.mode = mode
    95      self._position = 0
    96      self._reader_buffer_size = read_buffer_size
    97  
    98    def readinto(self, b):
    99      """Read up to len(b) bytes into b.
   100  
   101      Returns number of bytes read (0 for EOF).
   102  
   103      Args:
   104        b: (bytearray/memoryview) Buffer to read into.
   105      """
   106      self._checkClosed()
   107      if self._position >= self._downloader.size:
   108        return 0
   109  
   110      start = self._position
   111      end = min(self._position + len(b), self._downloader.size)
   112      data = self._downloader.get_range(start, end)
   113      self._position += len(data)
   114      b[:len(data)] = data
   115      return len(data)
   116  
   117    def seek(self, offset, whence=os.SEEK_SET):
   118      """Set the stream's current offset.
   119  
   120      Note if the new offset is out of bound, it is adjusted to either 0 or EOF.
   121  
   122      Args:
   123        offset: seek offset as number.
   124        whence: seek mode. Supported modes are os.SEEK_SET (absolute seek),
   125          os.SEEK_CUR (seek relative to the current position), and os.SEEK_END
   126          (seek relative to the end, offset should be negative).
   127  
   128      Raises:
   129        ``ValueError``: When this stream is closed or if whence is invalid.
   130      """
   131      self._checkClosed()
   132  
   133      if whence == os.SEEK_SET:
   134        self._position = offset
   135      elif whence == os.SEEK_CUR:
   136        self._position += offset
   137      elif whence == os.SEEK_END:
   138        self._position = self._downloader.size + offset
   139      else:
   140        raise ValueError('Whence mode %r is invalid.' % whence)
   141  
   142      self._position = min(self._position, self._downloader.size)
   143      self._position = max(self._position, 0)
   144      return self._position
   145  
   146    def tell(self):
   147      """Tell the stream's current offset.
   148  
   149      Returns:
   150        current offset in reading this stream.
   151  
   152      Raises:
   153        ``ValueError``: When this stream is closed.
   154      """
   155      self._checkClosed()
   156      return self._position
   157  
   158    def seekable(self):
   159      return True
   160  
   161    def readable(self):
   162      return True
   163  
   164    def readall(self):
   165      """Read until EOF, using multiple read() call."""
   166      res = []
   167      while True:
   168        data = self.read(self._reader_buffer_size)
   169        if not data:
   170          break
   171        res.append(data)
   172      return b''.join(res)
   173  
   174  
   175  class UploaderStream(io.RawIOBase):
   176    """Provides a stream interface for Uploader objects."""
   177    def __init__(self, uploader, mode='wb'):
   178      """Initializes the stream.
   179  
   180      Args:
   181        uploader: (Uploader) Filesystem dependent implementation.
   182        mode: (string) Python mode attribute for this stream.
   183      """
   184      self._uploader = uploader
   185      self.mode = mode
   186      self._position = 0
   187  
   188    def tell(self):
   189      return self._position
   190  
   191    def write(self, b):
   192      """Write bytes from b.
   193  
   194      Returns number of bytes written (<= len(b)).
   195  
   196      Args:
   197        b: (memoryview) Buffer with data to write.
   198      """
   199      self._checkClosed()
   200      self._uploader.put(b)
   201  
   202      bytes_written = len(b)
   203      self._position += bytes_written
   204      return bytes_written
   205  
   206    def close(self):
   207      """Complete the upload and close this stream.
   208  
   209      This method has no effect if the stream is already closed.
   210  
   211      Raises:
   212        Any error encountered by the uploader.
   213      """
   214      if not self.closed:
   215        self._uploader.finish()
   216  
   217      super().close()
   218  
   219    def writable(self):
   220      return True
   221  
   222  
   223  class PipeStream(object):
   224    """A class that presents a pipe connection as a readable stream.
   225  
   226    Not thread-safe.
   227  
   228    Remembers the last ``size`` bytes read and allows rewinding the stream by that
   229    amount exactly. See BEAM-6380 for more.
   230    """
   231    def __init__(self, recv_pipe):
   232      self.conn = recv_pipe
   233      self.closed = False
   234      self.position = 0
   235      self.remaining = b''
   236  
   237      # Data and position of last block streamed. Allows limited seeking backwards
   238      # of stream.
   239      self.last_block_position = None
   240      self.last_block = b''
   241  
   242    def read(self, size):
   243      """Read data from the wrapped pipe connection.
   244  
   245      Args:
   246        size: Number of bytes to read. Actual number of bytes read is always
   247              equal to size unless EOF is reached.
   248  
   249      Returns:
   250        data read as str.
   251      """
   252      data_list = []
   253      bytes_read = 0
   254      last_block_position = self.position
   255  
   256      while bytes_read < size:
   257        bytes_from_remaining = min(size - bytes_read, len(self.remaining))
   258        data_list.append(self.remaining[0:bytes_from_remaining])
   259        self.remaining = self.remaining[bytes_from_remaining:]
   260        self.position += bytes_from_remaining
   261        bytes_read += bytes_from_remaining
   262        if not self.remaining:
   263          try:
   264            self.remaining = self.conn.recv_bytes()
   265          except EOFError:
   266            break
   267  
   268      last_block = b''.join(data_list)
   269      if last_block:
   270        self.last_block_position = last_block_position
   271        self.last_block = last_block
   272      return last_block
   273  
   274    def tell(self):
   275      """Tell the file's current offset.
   276  
   277      Returns:
   278        current offset in reading this file.
   279  
   280      Raises:
   281        ``ValueError``: When this stream is closed.
   282      """
   283      self._check_open()
   284      return self.position
   285  
   286    def seek(self, offset, whence=os.SEEK_SET):
   287      # The apitools library used by the gcsio.Uploader class insists on seeking
   288      # to the end of a stream to do a check before completing an upload, so we
   289      # must have this no-op method here in that case.
   290      if whence == os.SEEK_END and offset == 0:
   291        return
   292      elif whence == os.SEEK_SET:
   293        if offset == self.position:
   294          return
   295        elif offset == self.last_block_position and self.last_block:
   296          self.position = offset
   297          self.remaining = b''.join([self.last_block, self.remaining])
   298          self.last_block = b''
   299          return
   300      raise NotImplementedError(
   301          'offset: %s, whence: %s, position: %s, last: %s' %
   302          (offset, whence, self.position, self.last_block_position))
   303  
   304    def _check_open(self):
   305      if self.closed:
   306        raise IOError('Stream is closed.')