github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/utils/bit_reader.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package utils 18 19 import ( 20 "encoding/binary" 21 "errors" 22 "io" 23 "math" 24 "reflect" 25 "unsafe" 26 27 "github.com/apache/arrow/go/v14/arrow" 28 "github.com/apache/arrow/go/v14/arrow/bitutil" 29 "github.com/apache/arrow/go/v14/arrow/memory" 30 "github.com/apache/arrow/go/v14/internal/utils" 31 ) 32 33 // masks for grabbing the trailing bits based on the number of trailing bits desired 34 var trailingMask [64]uint64 35 36 func init() { 37 // generate the masks at init so we don't have to hard code them. 38 for i := 0; i < 64; i++ { 39 trailingMask[i] = (math.MaxUint64 >> (64 - i)) 40 } 41 } 42 43 // trailingBits returns a value constructed from the bits trailing bits of 44 // the value v that is passed in. If bits >= 64, then we just return v. 45 func trailingBits(v uint64, bits uint) uint64 { 46 if bits >= 64 { 47 return v 48 } 49 return v & trailingMask[bits] 50 } 51 52 // reader is a useful interface to define the functionality we need for implementation 53 type reader interface { 54 io.Reader 55 io.ReaderAt 56 io.Seeker 57 } 58 59 // default buffer length 60 const buflen = 1024 61 62 // BitReader implements functionality for reading bits or bytes buffering up to a uint64 63 // at a time from the reader in order to improve efficiency. It also provides 64 // methods to read multiple bytes in one read such as encoded ints/values. 65 // 66 // This BitReader is the basis for the other utility classes like RLE decoding 67 // and such, providing the necessary functions for interpreting the values. 68 type BitReader struct { 69 reader reader 70 buffer uint64 71 byteoffset int64 72 bitoffset uint 73 raw [8]byte 74 75 unpackBuf [buflen]uint32 76 } 77 78 // NewBitReader takes in a reader that implements io.Reader, io.ReaderAt and io.Seeker 79 // interfaces and returns a BitReader for use with various bit level manipulations. 80 func NewBitReader(r reader) *BitReader { 81 return &BitReader{reader: r} 82 } 83 84 // CurOffset returns the current Byte offset into the data that the reader is at. 85 func (b *BitReader) CurOffset() int64 { 86 return b.byteoffset + bitutil.BytesForBits(int64(b.bitoffset)) 87 } 88 89 // Reset allows reusing a BitReader by setting a new reader and resetting the internal 90 // state back to zeros. 91 func (b *BitReader) Reset(r reader) { 92 b.reader = r 93 b.buffer = 0 94 b.byteoffset = 0 95 b.bitoffset = 0 96 } 97 98 // GetVlqInt reads a Vlq encoded int from the stream. The encoded value must start 99 // at the beginning of a byte and this returns false if there weren't enough bytes 100 // in the buffer or reader. This will call `ReadByte` which in turn retrieves byte 101 // aligned values from the reader 102 func (b *BitReader) GetVlqInt() (uint64, bool) { 103 tmp, err := binary.ReadUvarint(b) 104 if err != nil { 105 return 0, false 106 } 107 return tmp, true 108 } 109 110 // GetZigZagVlqInt reads a zigzag encoded integer, returning false if there weren't 111 // enough bytes remaining. 112 func (b *BitReader) GetZigZagVlqInt() (int64, bool) { 113 u, ok := b.GetVlqInt() 114 if !ok { 115 return 0, false 116 } 117 118 return int64(u>>1) ^ -int64(u&1), true 119 } 120 121 // ReadByte reads a single aligned byte from the underlying stream, or populating 122 // error if there aren't enough bytes left. 123 func (b *BitReader) ReadByte() (byte, error) { 124 var tmp byte 125 if ok := b.GetAligned(1, &tmp); !ok { 126 return 0, errors.New("failed to read byte") 127 } 128 129 return tmp, nil 130 } 131 132 // GetAligned reads nbytes from the underlying stream into the passed interface value. 133 // Returning false if there aren't enough bytes remaining in the stream or if an invalid 134 // type is passed. The bytes are read aligned to byte boundaries. 135 // 136 // v must be a pointer to a byte or sized uint type (*byte, *uint16, *uint32, *uint64). 137 // encoded values are assumed to be little endian. 138 func (b *BitReader) GetAligned(nbytes int, v interface{}) bool { 139 // figure out the number of bytes to represent v 140 typBytes := int(reflect.TypeOf(v).Elem().Size()) 141 if nbytes > typBytes { 142 return false 143 } 144 145 bread := bitutil.BytesForBits(int64(b.bitoffset)) 146 147 b.byteoffset += bread 148 n, err := b.reader.ReadAt(b.raw[:nbytes], b.byteoffset) 149 if err != nil && err != io.EOF { 150 return false 151 } 152 if n != nbytes { 153 return false 154 } 155 // zero pad the the bytes 156 memory.Set(b.raw[n:typBytes], 0) 157 158 switch v := v.(type) { 159 case *byte: 160 *v = b.raw[0] 161 case *uint64: 162 *v = binary.LittleEndian.Uint64(b.raw[:typBytes]) 163 case *uint32: 164 *v = binary.LittleEndian.Uint32(b.raw[:typBytes]) 165 case *uint16: 166 *v = binary.LittleEndian.Uint16(b.raw[:typBytes]) 167 default: 168 return false 169 } 170 171 b.byteoffset += int64(nbytes) 172 173 b.bitoffset = 0 174 b.fillbuffer() 175 return true 176 } 177 178 // fillbuffer fills the uint64 buffer with bytes from the underlying stream 179 func (b *BitReader) fillbuffer() error { 180 n, err := b.reader.ReadAt(b.raw[:], b.byteoffset) 181 if err != nil && n == 0 && err != io.EOF { 182 return err 183 } 184 for i := n; i < 8; i++ { 185 b.raw[i] = 0 186 } 187 b.buffer = binary.LittleEndian.Uint64(b.raw[:]) 188 return nil 189 } 190 191 // next reads an integral value from the next bits in the buffer 192 func (b *BitReader) next(bits uint) (v uint64, err error) { 193 v = trailingBits(b.buffer, b.bitoffset+bits) >> b.bitoffset 194 b.bitoffset += bits 195 // if we need more bits to get what was requested then refill the buffer 196 if b.bitoffset >= 64 { 197 b.byteoffset += 8 198 b.bitoffset -= 64 199 if err = b.fillbuffer(); err != nil { 200 return 0, err 201 } 202 v |= trailingBits(b.buffer, b.bitoffset) << (bits - b.bitoffset) 203 } 204 return 205 } 206 207 // GetBatchIndex is like GetBatch but for IndexType (used for dictionary decoding) 208 func (b *BitReader) GetBatchIndex(bits uint, out []IndexType) (i int, err error) { 209 // IndexType is a 32-bit value so bits must be less than 32 when unpacking 210 // values using the bitreader. 211 if bits > 32 { 212 return 0, errors.New("must be 32 bits or less per read") 213 } 214 215 var val uint64 216 217 length := len(out) 218 // if we're not currently byte-aligned, read bits until we are byte-aligned. 219 for ; i < length && b.bitoffset != 0; i++ { 220 val, err = b.next(bits) 221 out[i] = IndexType(val) 222 if err != nil { 223 return 224 } 225 } 226 227 b.reader.Seek(b.byteoffset, io.SeekStart) 228 // grab as many 32 byte chunks as possible in one shot 229 if i < length { // IndexType should be a 32 bit value so we can do quick unpacking right into the output 230 numUnpacked := unpack32(b.reader, (*(*[]uint32)(unsafe.Pointer(&out)))[i:], int(bits)) 231 i += numUnpacked 232 b.byteoffset += int64(numUnpacked * int(bits) / 8) 233 } 234 235 // re-fill our buffer just in case. 236 b.fillbuffer() 237 // grab the remaining values that aren't 32 byte aligned 238 for ; i < length; i++ { 239 val, err = b.next(bits) 240 out[i] = IndexType(val) 241 if err != nil { 242 break 243 } 244 } 245 return 246 } 247 248 // GetBatchBools is like GetBatch but optimized for reading bits as boolean values 249 func (b *BitReader) GetBatchBools(out []bool) (int, error) { 250 bits := uint(1) 251 length := len(out) 252 253 i := 0 254 // read until we are byte-aligned 255 for ; i < length && b.bitoffset != 0; i++ { 256 val, err := b.next(bits) 257 out[i] = val != 0 258 if err != nil { 259 return i, err 260 } 261 } 262 263 b.reader.Seek(b.byteoffset, io.SeekStart) 264 buf := arrow.Uint32Traits.CastToBytes(b.unpackBuf[:]) 265 blen := buflen * 8 266 for i < length { 267 // grab byte-aligned bits in a loop since it's more efficient than going 268 // bit by bit when you can grab 8 bools at a time. 269 unpackSize := utils.MinInt(blen, length-i) / 8 * 8 270 n, err := b.reader.Read(buf[:bitutil.BytesForBits(int64(unpackSize))]) 271 if err != nil { 272 return i, err 273 } 274 BytesToBools(buf[:n], out[i:]) 275 i += unpackSize 276 b.byteoffset += int64(n) 277 } 278 279 b.fillbuffer() 280 // grab the trailing bits 281 for ; i < length; i++ { 282 val, err := b.next(bits) 283 out[i] = val != 0 284 if err != nil { 285 return i, err 286 } 287 } 288 289 return i, nil 290 } 291 292 // GetBatch fills out by decoding values repeated from the stream that are encoded 293 // using bits as the number of bits per value. The values are expected to be bit packed 294 // so we will unpack the values to populate. 295 func (b *BitReader) GetBatch(bits uint, out []uint64) (int, error) { 296 // since we're unpacking into uint64 values, we can't support bits being 297 // larger than 64 here as that's the largest size value we're reading 298 if bits > 64 { 299 return 0, errors.New("must be 64 bits or less per read") 300 } 301 302 length := len(out) 303 304 i := 0 305 // read until we are byte aligned 306 for ; i < length && b.bitoffset != 0; i++ { 307 val, err := b.next(bits) 308 out[i] = val 309 if err != nil { 310 return i, err 311 } 312 } 313 314 b.reader.Seek(b.byteoffset, io.SeekStart) 315 for i < length { 316 // unpack groups of 32 bytes at a time into a buffer since it's more efficient 317 unpackSize := utils.MinInt(buflen, length-i) 318 numUnpacked := unpack32(b.reader, b.unpackBuf[:unpackSize], int(bits)) 319 if numUnpacked == 0 { 320 break 321 } 322 323 for k := 0; k < numUnpacked; k++ { 324 out[i+k] = uint64(b.unpackBuf[k]) 325 } 326 i += numUnpacked 327 b.byteoffset += int64(numUnpacked * int(bits) / 8) 328 } 329 330 b.fillbuffer() 331 // and then the remaining trailing values 332 for ; i < length; i++ { 333 val, err := b.next(bits) 334 out[i] = val 335 if err != nil { 336 return i, err 337 } 338 } 339 340 return i, nil 341 } 342 343 // GetValue returns a single value that is bit packed using width as the number of bits 344 // and returns false if there weren't enough bits remaining. 345 func (b *BitReader) GetValue(width int) (uint64, bool) { 346 v := make([]uint64, 1) 347 n, _ := b.GetBatch(uint(width), v) 348 return v[0], n == 1 349 }