#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# cython: language_level=3

"""Compiled version of the Stream objects used by CoderImpl.

For internal use only; no backwards-compatibility guarantees.
"""

cimport libc.stdlib
cimport libc.string


cdef class OutputStream(object):
  """An output string stream implementation supporting write() and get().

  Bytes are appended to a malloc'ed buffer (`data`) that starts at 1024 bytes
  and is doubled on demand; `pos` is the number of bytes written so far.
  """

  #TODO(robertwb): Consider using raw C++ streams.

  def __cinit__(self):
    self.buffer_size = 1024
    self.pos = 0
    self.data = <char*>libc.stdlib.malloc(self.buffer_size)
    # Raise instead of assert: assertions can be compiled out, which would
    # leave later writes dereferencing a NULL buffer.
    if self.data == NULL:
      raise MemoryError("OutputStream malloc failed.")

  def __dealloc__(self):
    if self.data:
      libc.stdlib.free(self.data)

  cpdef write(self, bytes b, bint nested=False):
    """Appends the bytes `b`, preceded by their varint length if `nested`."""
    cdef size_t blen = len(b)
    if nested:
      self.write_var_int64(blen)
    if self.buffer_size < self.pos + blen:
      self.extend(blen)
    libc.string.memcpy(self.data + self.pos, <char*>b, blen)
    self.pos += blen

  cpdef write_byte(self, unsigned char val):
    """Appends a single byte."""
    if self.buffer_size < self.pos + 1:
      self.extend(1)
    self.data[self.pos] = val
    self.pos += 1

  cpdef write_var_int64(self, libc.stdint.int64_t signed_v):
    """Encode a long using variable-length encoding to a stream.

    The value is reinterpreted as unsigned and emitted seven bits at a time,
    least-significant group first, with the high bit of each byte set when
    more bytes follow.
    """
    cdef libc.stdint.uint64_t v = signed_v
    # Inline common case: a one-byte value that fits without growing.
    if v <= 0x7F and self.pos < self.buffer_size - 1:
      self.data[self.pos] = v
      self.pos += 1
      return

    cdef long bits
    while True:
      bits = v & 0x7F
      v >>= 7
      if v:
        bits |= 0x80
      self.write_byte(<unsigned char>bits)
      if not v:
        break

  cpdef write_bigendian_int64(self, libc.stdint.int64_t signed_v):
    """Appends `signed_v` as 8 bytes, most significant byte first."""
    self.write_bigendian_uint64(signed_v)

  cpdef write_bigendian_uint64(self, libc.stdint.uint64_t v):
    """Appends `v` as 8 bytes, most significant byte first."""
    if self.buffer_size < self.pos + 8:
      self.extend(8)
    self.data[self.pos    ] = <unsigned char>(v >> 56)
    self.data[self.pos + 1] = <unsigned char>(v >> 48)
    self.data[self.pos + 2] = <unsigned char>(v >> 40)
    self.data[self.pos + 3] = <unsigned char>(v >> 32)
    self.data[self.pos + 4] = <unsigned char>(v >> 24)
    self.data[self.pos + 5] = <unsigned char>(v >> 16)
    self.data[self.pos + 6] = <unsigned char>(v >>  8)
    self.data[self.pos + 7] = <unsigned char>(v      )
    self.pos += 8

  cpdef write_bigendian_int32(self, libc.stdint.int32_t signed_v):
    """Appends `signed_v` as 4 bytes, most significant byte first."""
    cdef libc.stdint.uint32_t v = signed_v
    if self.buffer_size < self.pos + 4:
      self.extend(4)
    self.data[self.pos    ] = <unsigned char>(v >> 24)
    self.data[self.pos + 1] = <unsigned char>(v >> 16)
    self.data[self.pos + 2] = <unsigned char>(v >>  8)
    self.data[self.pos + 3] = <unsigned char>(v      )
    self.pos += 4

  cpdef write_bigendian_int16(self, libc.stdint.int16_t signed_v):
    """Appends `signed_v` as 2 bytes, most significant byte first."""
    cdef libc.stdint.uint16_t v = signed_v
    if self.buffer_size < self.pos + 2:
      self.extend(2)
    self.data[self.pos    ] = <unsigned char>(v >>  8)
    self.data[self.pos + 1] = <unsigned char>(v      )
    self.pos += 2

  cpdef write_bigendian_double(self, double d):
    """Appends the bit pattern of `d`, written as a big-endian int64."""
    self.write_bigendian_int64((<libc.stdint.int64_t*><char*>&d)[0])

  cpdef write_bigendian_float(self, float f):
    """Appends the bit pattern of `f`, written as a big-endian int32."""
    self.write_bigendian_int32((<libc.stdint.int32_t*><char*>&f)[0])

  cpdef bytes get(self):
    """Returns a copy of everything written so far as a bytes object."""
    return self.data[:self.pos]

  cpdef size_t size(self) except? -1:
    """Returns the number of bytes written so far."""
    return self.pos

  cdef extend(self, size_t missing):
    """Grows the buffer by doubling until `missing` more bytes fit.

    Raises:
      MemoryError: if the buffer cannot be reallocated. The existing buffer
        and its recorded size are left untouched in that case.
    """
    cdef size_t new_size = self.buffer_size
    cdef char* new_data
    while missing > new_size - self.pos:
      new_size *= 2
    # realloc() leaves the original allocation valid when it fails, so only
    # overwrite self.data once the call has succeeded; otherwise the old
    # buffer would be leaked and self.data left NULL.
    new_data = <char*>libc.stdlib.realloc(self.data, new_size)
    if new_data == NULL:
      raise MemoryError("OutputStream realloc failed.")
    self.data = new_data
    self.buffer_size = new_size

  cpdef _clear(self):
    """Resets the write position to 0; the allocated buffer is kept."""
    self.pos = 0


cdef class ByteCountingOutputStream(OutputStream):
  """An output string stream implementation that only counts the bytes.

  This implementation counts the number of bytes it "writes" but
  doesn't actually write them anyway. Thus it has write() but not
  get(). get_count() returns how many bytes were written.

  This is useful for sizing an encoding.
  """

  def __cinit__(self):
    self.count = 0

  cpdef write(self, bytes b, bint nested=False):
    """Counts len(b) bytes, plus the varint length prefix if `nested`."""
    cdef size_t blen = len(b)
    if nested:
      self.write_var_int64(blen)
    self.count += blen

  cpdef write_var_int64(self, libc.stdint.int64_t signed_v):
    """Counts the bytes the varint encoding of `signed_v` would take."""
    self.count += get_varint_size(signed_v)

  cpdef write_byte(self, unsigned char _):
    self.count += 1

  cpdef write_bigendian_int64(self, libc.stdint.int64_t _):
    self.count += 8

  cpdef write_bigendian_uint64(self, libc.stdint.uint64_t _):
    self.count += 8

  cpdef write_bigendian_int32(self, libc.stdint.int32_t _):
    self.count += 4

  cpdef write_bigendian_int16(self, libc.stdint.int16_t _):
    self.count += 2

  cpdef size_t get_count(self):
    """Returns the number of bytes counted so far."""
    return self.count

  cpdef bytes get(self):
    """Not supported: no bytes are retained by this stream."""
    raise NotImplementedError

  def __str__(self):
    return '<%s %s>' % (self.__class__.__name__, self.count)


cdef class InputStream(object):
  """An input string stream implementation supporting read() and size().

  NOTE(review): none of the read methods below check that enough bytes
  remain before advancing `pos` and indexing the underlying buffer; callers
  are expected to request only data that is present.
  """

  def __init__(self, all):
    # `all` keeps the Python object; `allc` is the handle indexed by the
    # readers below (presumably a char* view declared in the .pxd -- confirm).
    self.allc = self.all = all

  cpdef bytes read(self, size_t size):
    """Returns the next `size` bytes and advances past them."""
    self.pos += size
    return self.allc[self.pos - size : self.pos]

  cpdef long read_byte(self) except? -1:
    """Returns the next byte as an unsigned value and advances past it."""
    self.pos += 1
    # Note: Some C++ compilers treats the char array below as a signed char.
    # This causes incorrect coder behavior unless explicitly cast to an
    # unsigned char here.
    return <long>(<unsigned char> self.allc[self.pos - 1])

  cpdef ssize_t size(self) except? -1:
    """Returns the number of bytes remaining after the current position."""
    return len(self.all) - self.pos

  cpdef bytes read_all(self, bint nested=False):
    """Reads a varint-length-prefixed chunk if `nested`, else all the rest."""
    return self.read(<ssize_t>self.read_var_int64() if nested else self.size())

  cpdef libc.stdint.int64_t read_var_int64(self) except? -1:
    """Decode a variable-length encoded long from a stream.

    Raises:
      RuntimeError: if the encoding carries more bits than fit in 64.
    """
    # Inline common case.
    cdef long byte = <unsigned char> self.allc[self.pos]
    self.pos += 1
    if byte <= 0x7F:
      return byte

    cdef libc.stdint.int64_t bits
    cdef long shift = 0
    cdef libc.stdint.int64_t result = 0
    while True:
      bits = byte & 0x7F
      if (shift >= sizeof(libc.stdint.int64_t) * 8 or
          (shift >= (sizeof(libc.stdint.int64_t) * 8 - 1) and bits > 1)):
        raise RuntimeError('VarLong too long.')
      result |= bits << shift
      shift += 7
      if not (byte & 0x80):
        break
      byte = self.read_byte()
      # NOTE(review): read_byte() as defined in this class always yields
      # 0..255, so this branch looks reachable only if a subclass overrides
      # read_byte -- confirm.
      if byte < 0:
        raise RuntimeError('VarInt not terminated.')

    return result

  cpdef libc.stdint.int64_t read_bigendian_int64(self) except? -1:
    """Reads 8 bytes, most significant first, as a signed 64-bit value."""
    return self.read_bigendian_uint64()

  cpdef libc.stdint.uint64_t read_bigendian_uint64(self) except? -1:
    """Reads 8 bytes, most significant first, as an unsigned 64-bit value."""
    self.pos += 8
    return (<unsigned char>self.allc[self.pos - 1]
      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 2] <<  8
      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 3] << 16
      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 4] << 24
      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 5] << 32
      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 6] << 40
      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 7] << 48
      | <libc.stdint.uint64_t><unsigned char>self.allc[self.pos - 8] << 56)

  cpdef libc.stdint.int32_t read_bigendian_int32(self) except? -1:
    """Reads 4 bytes, most significant first, as a signed 32-bit value."""
    self.pos += 4
    return (<unsigned char>self.allc[self.pos - 1]
      | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 2] <<  8
      | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 3] << 16
      | <libc.stdint.uint32_t><unsigned char>self.allc[self.pos - 4] << 24)

  cpdef libc.stdint.int16_t read_bigendian_int16(self) except? -1:
    """Reads 2 bytes, most significant first, as a signed 16-bit value."""
    self.pos += 2
    return (<unsigned char>self.allc[self.pos - 1]
      | <libc.stdint.uint16_t><unsigned char>self.allc[self.pos - 2] <<  8)

  cpdef double read_bigendian_double(self) except? -1:
    """Reads a big-endian int64 and reinterprets its bits as a double."""
    cdef libc.stdint.int64_t as_long = self.read_bigendian_int64()
    return (<double*><char*>&as_long)[0]

  cpdef float read_bigendian_float(self) except? -1:
    """Reads a big-endian int32 and reinterprets its bits as a float."""
    cdef libc.stdint.int32_t as_int = self.read_bigendian_int32()
    return (<float*><char*>&as_int)[0]


cpdef libc.stdint.int64_t get_varint_size(libc.stdint.int64_t value):
  """Returns the size of the given integer value when encoded as a VarInt."""
  cdef libc.stdint.int64_t varint_size = 0
  # Treat the value as unsigned so the shift below terminates for negatives.
  cdef libc.stdint.uint64_t bits = value
  while True:
    varint_size += 1
    bits >>= 7
    if not bits:
      break
  return varint_size