github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/coders/stream.pyx (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # cython: language_level=3
    19  
    20  """Compiled version of the Stream objects used by CoderImpl.
    21  
    22  For internal use only; no backwards-compatibility guarantees.
    23  """
    24  
    25  cimport libc.stdlib
    26  cimport libc.string
    27  
    28  
    29  cdef class OutputStream(object):
    30    """An output string stream implementation supporting write() and get()."""
    31  
    32    #TODO(robertwb): Consider using raw C++ streams.
    33  
    34    def __cinit__(self):
    35      self.buffer_size = 1024
    36      self.pos = 0
    37      self.data = <char*>libc.stdlib.malloc(self.buffer_size)
    38      assert self.data, "OutputStream malloc failed."
    39  
    40    def __dealloc__(self):
    41      if self.data:
    42        libc.stdlib.free(self.data)
    43  
    44    cpdef write(self, bytes b, bint nested=False):
    45      cdef size_t blen = len(b)
    46      if nested:
    47        self.write_var_int64(blen)
    48      if self.buffer_size < self.pos + blen:
    49        self.extend(blen)
    50      libc.string.memcpy(self.data + self.pos, <char*>b, blen)
    51      self.pos += blen
    52  
    53    cpdef write_byte(self, unsigned char val):
    54      if  self.buffer_size < self.pos + 1:
    55        self.extend(1)
    56      self.data[self.pos] = val
    57      self.pos += 1
    58  
    59    cpdef write_var_int64(self, libc.stdint.int64_t signed_v):
    60      """Encode a long using variable-length encoding to a stream."""
    61      cdef libc.stdint.uint64_t v = signed_v
    62      # Inline common case.
    63      if v <= 0x7F and self.pos < self.buffer_size - 1:
    64        self.data[self.pos] = v
    65        self.pos += 1
    66        return
    67  
    68      cdef long bits
    69      while True:
    70        bits = v & 0x7F
    71        v >>= 7
    72        if v:
    73          bits |= 0x80
    74        self.write_byte(<unsigned char>bits)
    75        if not v:
    76          break
    77  
    78    cpdef write_bigendian_int64(self, libc.stdint.int64_t signed_v):
    79      self.write_bigendian_uint64(signed_v)
    80  
    81    cpdef write_bigendian_uint64(self, libc.stdint.uint64_t v):
    82      if  self.buffer_size < self.pos + 8:
    83        self.extend(8)
    84      self.data[self.pos    ] = <unsigned char>(v >> 56)
    85      self.data[self.pos + 1] = <unsigned char>(v >> 48)
    86      self.data[self.pos + 2] = <unsigned char>(v >> 40)
    87      self.data[self.pos + 3] = <unsigned char>(v >> 32)
    88      self.data[self.pos + 4] = <unsigned char>(v >> 24)
    89      self.data[self.pos + 5] = <unsigned char>(v >> 16)
    90      self.data[self.pos + 6] = <unsigned char>(v >>  8)
    91      self.data[self.pos + 7] = <unsigned char>(v      )
    92      self.pos += 8
    93  
    94    cpdef write_bigendian_int32(self, libc.stdint.int32_t signed_v):
    95      cdef libc.stdint.uint32_t v = signed_v
    96      if  self.buffer_size < self.pos + 4:
    97        self.extend(4)
    98      self.data[self.pos    ] = <unsigned char>(v >> 24)
    99      self.data[self.pos + 1] = <unsigned char>(v >> 16)
   100      self.data[self.pos + 2] = <unsigned char>(v >>  8)
   101      self.data[self.pos + 3] = <unsigned char>(v      )
   102      self.pos += 4
   103  
   104    cpdef write_bigendian_int16(self, libc.stdint.int16_t signed_v):
   105      cdef libc.stdint.uint16_t v = signed_v
   106      if  self.buffer_size < self.pos + 2:
   107        self.extend(2)
   108      self.data[self.pos    ] = <unsigned char>(v >>  8)
   109      self.data[self.pos + 1] = <unsigned char>(v      )
   110      self.pos += 2
   111  
   112    cpdef write_bigendian_double(self, double d):
   113      self.write_bigendian_int64((<libc.stdint.int64_t*><char*>&d)[0])
   114  
   115    cpdef write_bigendian_float(self, float f):
   116      self.write_bigendian_int32((<libc.stdint.int32_t*><char*>&f)[0])
   117  
   118    cpdef bytes get(self):
   119      return self.data[:self.pos]
   120  
   121    cpdef size_t size(self) except? -1:
   122      return self.pos
   123  
   124    cdef extend(self, size_t missing):
   125      while missing > self.buffer_size - self.pos:
   126        self.buffer_size *= 2
   127      self.data = <char*>libc.stdlib.realloc(self.data, self.buffer_size)
   128      assert self.data, "OutputStream realloc failed."
   129  
   130    cpdef _clear(self):
   131      self.pos = 0
   132  
   133  
   134  cdef class ByteCountingOutputStream(OutputStream):
   135    """An output string stream implementation that only counts the bytes.
   136  
   137    This implementation counts the number of bytes it "writes" but
   138    doesn't actually write them anyway.  Thus it has write() but not
   139    get().  get_count() returns how many bytes were written.
   140  
   141    This is useful for sizing an encoding.
   142    """
   143  
   144    def __cinit__(self):
   145      self.count = 0
   146  
   147    cpdef write(self, bytes b, bint nested=False):
   148      cdef size_t blen = len(b)
   149      if nested:
   150        self.write_var_int64(blen)
   151      self.count += blen
   152  
   153    cpdef write_var_int64(self, libc.stdint.int64_t signed_v):
   154      self.count += get_varint_size(signed_v)
   155  
   156    cpdef write_byte(self, unsigned char _):
   157      self.count += 1
   158  
   159    cpdef write_bigendian_int64(self, libc.stdint.int64_t _):
   160      self.count += 8
   161  
   162    cpdef write_bigendian_uint64(self, libc.stdint.uint64_t _):
   163      self.count += 8
   164  
   165    cpdef write_bigendian_int32(self, libc.stdint.int32_t _):
   166      self.count += 4
   167  
   168    cpdef write_bigendian_int16(self, libc.stdint.int16_t _):
   169      self.count += 2
   170  
   171    cpdef size_t get_count(self):
   172      return self.count
   173  
   174    cpdef bytes get(self):
   175      raise NotImplementedError
   176  
   177    def __str__(self):
   178      return '<%s %s>' % (self.__class__.__name__, self.count)
   179  
   180  
   181  cdef class InputStream(object):
   182    """An input string stream implementation supporting read() and size()."""
   183  
   184    def __init__(self, all):
   185      self.allc = self.all = all
   186  
   187    cpdef bytes read(self, size_t size):
   188      self.pos += size
   189      return self.allc[self.pos - size : self.pos]
   190  
   191    cpdef long read_byte(self) except? -1:
   192      self.pos += 1
   193      # Note: Some C++ compilers treats the char array below as a signed char.
   194      # This causes incorrect coder behavior unless explicitly cast to an
   195      # unsigned char here.
   196      return <long>(<unsigned char> self.allc[self.pos - 1])
   197  
   198    cpdef ssize_t size(self) except? -1:
   199      return len(self.all) - self.pos
   200  
   201    cpdef bytes read_all(self, bint nested=False):
   202      return self.read(<ssize_t>self.read_var_int64() if nested else self.size())
   203  
   204    cpdef libc.stdint.int64_t read_var_int64(self) except? -1:
   205      """Decode a variable-length encoded long from a stream."""
   206      # Inline common case.
   207      cdef long byte = <unsigned char> self.allc[self.pos]
   208      self.pos += 1
   209      if byte <= 0x7F:
   210        return byte
   211  
   212      cdef libc.stdint.int64_t bits
   213      cdef long shift = 0
   214      cdef libc.stdint.int64_t result = 0
   215      while True:
   216        bits = byte & 0x7F
   217        if (shift >= sizeof(libc.stdint.int64_t) * 8 or
   218            (shift >= (sizeof(libc.stdint.int64_t) * 8 - 1) and bits > 1)):
   219          raise RuntimeError('VarLong too long.')
   220        result |= bits << shift
   221        shift += 7
   222        if not (byte & 0x80):
   223          break
   224        byte = self.read_byte()
   225        if byte < 0:
   226          raise RuntimeError('VarInt not terminated.')
   227  
   228      return result
   229  
   230    cpdef libc.stdint.int64_t read_bigendian_int64(self) except? -1:
   231      return self.read_bigendian_uint64()
   232  
   233    cpdef libc.stdint.uint64_t read_bigendian_uint64(self) except? -1:
   234      self.pos += 8
   235      return (<unsigned char>self.allc[self.pos - 1]
   236        | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 2] <<  8
   237        | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 3] << 16
   238        | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 4] << 24
   239        | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 5] << 32
   240        | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 6] << 40
   241        | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 7] << 48
   242        | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 8] << 56)
   243  
   244    cpdef libc.stdint.int32_t read_bigendian_int32(self) except? -1:
   245      self.pos += 4
   246      return (<unsigned char>self.allc[self.pos - 1]
   247        | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 2] <<  8
   248        | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 3] << 16
   249        | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 4] << 24)
   250  
   251    cpdef libc.stdint.int16_t read_bigendian_int16(self) except? -1:
   252      self.pos += 2
   253      return (<unsigned char>self.allc[self.pos - 1]
   254        | <libc.stdint.uint16_t><unsigned char>self.allc[self.pos - 2] <<  8)
   255  
   256    cpdef double read_bigendian_double(self) except? -1:
   257      cdef libc.stdint.int64_t as_long = self.read_bigendian_int64()
   258      return (<double*><char*>&as_long)[0]
   259  
   260    cpdef float read_bigendian_float(self) except? -1:
   261      cdef libc.stdint.int32_t as_int = self.read_bigendian_int32()
   262      return (<float*><char*>&as_int)[0]
   263  
   264  cpdef libc.stdint.int64_t get_varint_size(libc.stdint.int64_t value):
   265    """Returns the size of the given integer value when encode as a VarInt."""
   266    cdef libc.stdint.int64_t varint_size = 0
   267    cdef libc.stdint.uint64_t bits = value
   268    while True:
   269      varint_size += 1
   270      bits >>= 7
   271      if not bits:
   272        break
   273    return varint_size