github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/utils/bit_reader.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package utils
    18  
    19  import (
    20  	"encoding/binary"
    21  	"errors"
    22  	"io"
    23  	"math"
    24  	"reflect"
    25  	"unsafe"
    26  
    27  	"github.com/apache/arrow/go/v14/arrow"
    28  	"github.com/apache/arrow/go/v14/arrow/bitutil"
    29  	"github.com/apache/arrow/go/v14/arrow/memory"
    30  	"github.com/apache/arrow/go/v14/internal/utils"
    31  )
    32  
    33  // masks for grabbing the trailing bits based on the number of trailing bits desired
    34  var trailingMask [64]uint64
    35  
    36  func init() {
    37  	// generate the masks at init so we don't have to hard code them.
    38  	for i := 0; i < 64; i++ {
    39  		trailingMask[i] = (math.MaxUint64 >> (64 - i))
    40  	}
    41  }
    42  
    43  // trailingBits returns a value constructed from the bits trailing bits of
    44  // the value v that is passed in. If bits >= 64, then we just return v.
    45  func trailingBits(v uint64, bits uint) uint64 {
    46  	if bits >= 64 {
    47  		return v
    48  	}
    49  	return v & trailingMask[bits]
    50  }
    51  
    52  // reader is a useful interface to define the functionality we need for implementation
    53  type reader interface {
    54  	io.Reader
    55  	io.ReaderAt
    56  	io.Seeker
    57  }
    58  
// default buffer length: the number of uint32 slots in BitReader.unpackBuf
const buflen = 1024

// BitReader implements functionality for reading bits or bytes buffering up to a uint64
// at a time from the reader in order to improve efficiency. It also provides
// methods to read multiple bytes in one read such as encoded ints/values.
//
// This BitReader is the basis for the other utility classes like RLE decoding
// and such, providing the necessary functions for interpreting the values.
type BitReader struct {
	// reader is the underlying data source; it is accessed with ReadAt for
	// buffered/aligned reads and with Seek + Read for the batch fast paths.
	reader     reader
	// buffer holds the 8 bytes found at byteoffset, decoded as a little endian
	// uint64 (zero padded when fewer than 8 bytes could be read).
	buffer     uint64
	// byteoffset is the position in reader of the first byte held in buffer.
	byteoffset int64
	// bitoffset is the number of bits of buffer that have already been consumed.
	bitoffset  uint
	// raw is scratch space for the bytes fetched with ReadAt before decoding.
	raw        [8]byte

	// unpackBuf is scratch space for the batch readers: GetBatch unpacks 32-bit
	// values into it and GetBatchBools reuses its memory as a byte buffer.
	unpackBuf [buflen]uint32
}
    77  
    78  // NewBitReader takes in a reader that implements io.Reader, io.ReaderAt and io.Seeker
    79  // interfaces and returns a BitReader for use with various bit level manipulations.
    80  func NewBitReader(r reader) *BitReader {
    81  	return &BitReader{reader: r}
    82  }
    83  
    84  // CurOffset returns the current Byte offset into the data that the reader is at.
    85  func (b *BitReader) CurOffset() int64 {
    86  	return b.byteoffset + bitutil.BytesForBits(int64(b.bitoffset))
    87  }
    88  
    89  // Reset allows reusing a BitReader by setting a new reader and resetting the internal
    90  // state back to zeros.
    91  func (b *BitReader) Reset(r reader) {
    92  	b.reader = r
    93  	b.buffer = 0
    94  	b.byteoffset = 0
    95  	b.bitoffset = 0
    96  }
    97  
    98  // GetVlqInt reads a Vlq encoded int from the stream. The encoded value must start
    99  // at the beginning of a byte and this returns false if there weren't enough bytes
   100  // in the buffer or reader. This will call `ReadByte` which in turn retrieves byte
   101  // aligned values from the reader
   102  func (b *BitReader) GetVlqInt() (uint64, bool) {
   103  	tmp, err := binary.ReadUvarint(b)
   104  	if err != nil {
   105  		return 0, false
   106  	}
   107  	return tmp, true
   108  }
   109  
   110  // GetZigZagVlqInt reads a zigzag encoded integer, returning false if there weren't
   111  // enough bytes remaining.
   112  func (b *BitReader) GetZigZagVlqInt() (int64, bool) {
   113  	u, ok := b.GetVlqInt()
   114  	if !ok {
   115  		return 0, false
   116  	}
   117  
   118  	return int64(u>>1) ^ -int64(u&1), true
   119  }
   120  
   121  // ReadByte reads a single aligned byte from the underlying stream, or populating
   122  // error if there aren't enough bytes left.
   123  func (b *BitReader) ReadByte() (byte, error) {
   124  	var tmp byte
   125  	if ok := b.GetAligned(1, &tmp); !ok {
   126  		return 0, errors.New("failed to read byte")
   127  	}
   128  
   129  	return tmp, nil
   130  }
   131  
   132  // GetAligned reads nbytes from the underlying stream into the passed interface value.
   133  // Returning false if there aren't enough bytes remaining in the stream or if an invalid
   134  // type is passed. The bytes are read aligned to byte boundaries.
   135  //
   136  // v must be a pointer to a byte or sized uint type (*byte, *uint16, *uint32, *uint64).
   137  // encoded values are assumed to be little endian.
   138  func (b *BitReader) GetAligned(nbytes int, v interface{}) bool {
   139  	// figure out the number of bytes to represent v
   140  	typBytes := int(reflect.TypeOf(v).Elem().Size())
   141  	if nbytes > typBytes {
   142  		return false
   143  	}
   144  
   145  	bread := bitutil.BytesForBits(int64(b.bitoffset))
   146  
   147  	b.byteoffset += bread
   148  	n, err := b.reader.ReadAt(b.raw[:nbytes], b.byteoffset)
   149  	if err != nil && err != io.EOF {
   150  		return false
   151  	}
   152  	if n != nbytes {
   153  		return false
   154  	}
   155  	// zero pad the the bytes
   156  	memory.Set(b.raw[n:typBytes], 0)
   157  
   158  	switch v := v.(type) {
   159  	case *byte:
   160  		*v = b.raw[0]
   161  	case *uint64:
   162  		*v = binary.LittleEndian.Uint64(b.raw[:typBytes])
   163  	case *uint32:
   164  		*v = binary.LittleEndian.Uint32(b.raw[:typBytes])
   165  	case *uint16:
   166  		*v = binary.LittleEndian.Uint16(b.raw[:typBytes])
   167  	default:
   168  		return false
   169  	}
   170  
   171  	b.byteoffset += int64(nbytes)
   172  
   173  	b.bitoffset = 0
   174  	b.fillbuffer()
   175  	return true
   176  }
   177  
   178  // fillbuffer fills the uint64 buffer with bytes from the underlying stream
   179  func (b *BitReader) fillbuffer() error {
   180  	n, err := b.reader.ReadAt(b.raw[:], b.byteoffset)
   181  	if err != nil && n == 0 && err != io.EOF {
   182  		return err
   183  	}
   184  	for i := n; i < 8; i++ {
   185  		b.raw[i] = 0
   186  	}
   187  	b.buffer = binary.LittleEndian.Uint64(b.raw[:])
   188  	return nil
   189  }
   190  
   191  // next reads an integral value from the next bits in the buffer
   192  func (b *BitReader) next(bits uint) (v uint64, err error) {
   193  	v = trailingBits(b.buffer, b.bitoffset+bits) >> b.bitoffset
   194  	b.bitoffset += bits
   195  	// if we need more bits to get what was requested then refill the buffer
   196  	if b.bitoffset >= 64 {
   197  		b.byteoffset += 8
   198  		b.bitoffset -= 64
   199  		if err = b.fillbuffer(); err != nil {
   200  			return 0, err
   201  		}
   202  		v |= trailingBits(b.buffer, b.bitoffset) << (bits - b.bitoffset)
   203  	}
   204  	return
   205  }
   206  
   207  // GetBatchIndex is like GetBatch but for IndexType (used for dictionary decoding)
   208  func (b *BitReader) GetBatchIndex(bits uint, out []IndexType) (i int, err error) {
   209  	// IndexType is a 32-bit value so bits must be less than 32 when unpacking
   210  	// values using the bitreader.
   211  	if bits > 32 {
   212  		return 0, errors.New("must be 32 bits or less per read")
   213  	}
   214  
   215  	var val uint64
   216  
   217  	length := len(out)
   218  	// if we're not currently byte-aligned, read bits until we are byte-aligned.
   219  	for ; i < length && b.bitoffset != 0; i++ {
   220  		val, err = b.next(bits)
   221  		out[i] = IndexType(val)
   222  		if err != nil {
   223  			return
   224  		}
   225  	}
   226  
   227  	b.reader.Seek(b.byteoffset, io.SeekStart)
   228  	// grab as many 32 byte chunks as possible in one shot
   229  	if i < length { // IndexType should be a 32 bit value so we can do quick unpacking right into the output
   230  		numUnpacked := unpack32(b.reader, (*(*[]uint32)(unsafe.Pointer(&out)))[i:], int(bits))
   231  		i += numUnpacked
   232  		b.byteoffset += int64(numUnpacked * int(bits) / 8)
   233  	}
   234  
   235  	// re-fill our buffer just in case.
   236  	b.fillbuffer()
   237  	// grab the remaining values that aren't 32 byte aligned
   238  	for ; i < length; i++ {
   239  		val, err = b.next(bits)
   240  		out[i] = IndexType(val)
   241  		if err != nil {
   242  			break
   243  		}
   244  	}
   245  	return
   246  }
   247  
   248  // GetBatchBools is like GetBatch but optimized for reading bits as boolean values
   249  func (b *BitReader) GetBatchBools(out []bool) (int, error) {
   250  	bits := uint(1)
   251  	length := len(out)
   252  
   253  	i := 0
   254  	// read until we are byte-aligned
   255  	for ; i < length && b.bitoffset != 0; i++ {
   256  		val, err := b.next(bits)
   257  		out[i] = val != 0
   258  		if err != nil {
   259  			return i, err
   260  		}
   261  	}
   262  
   263  	b.reader.Seek(b.byteoffset, io.SeekStart)
   264  	buf := arrow.Uint32Traits.CastToBytes(b.unpackBuf[:])
   265  	blen := buflen * 8
   266  	for i < length {
   267  		// grab byte-aligned bits in a loop since it's more efficient than going
   268  		// bit by bit when you can grab 8 bools at a time.
   269  		unpackSize := utils.MinInt(blen, length-i) / 8 * 8
   270  		n, err := b.reader.Read(buf[:bitutil.BytesForBits(int64(unpackSize))])
   271  		if err != nil {
   272  			return i, err
   273  		}
   274  		BytesToBools(buf[:n], out[i:])
   275  		i += unpackSize
   276  		b.byteoffset += int64(n)
   277  	}
   278  
   279  	b.fillbuffer()
   280  	// grab the trailing bits
   281  	for ; i < length; i++ {
   282  		val, err := b.next(bits)
   283  		out[i] = val != 0
   284  		if err != nil {
   285  			return i, err
   286  		}
   287  	}
   288  
   289  	return i, nil
   290  }
   291  
   292  // GetBatch fills out by decoding values repeated from the stream that are encoded
   293  // using bits as the number of bits per value. The values are expected to be bit packed
   294  // so we will unpack the values to populate.
   295  func (b *BitReader) GetBatch(bits uint, out []uint64) (int, error) {
   296  	// since we're unpacking into uint64 values, we can't support bits being
   297  	// larger than 64 here as that's the largest size value we're reading
   298  	if bits > 64 {
   299  		return 0, errors.New("must be 64 bits or less per read")
   300  	}
   301  
   302  	length := len(out)
   303  
   304  	i := 0
   305  	// read until we are byte aligned
   306  	for ; i < length && b.bitoffset != 0; i++ {
   307  		val, err := b.next(bits)
   308  		out[i] = val
   309  		if err != nil {
   310  			return i, err
   311  		}
   312  	}
   313  
   314  	b.reader.Seek(b.byteoffset, io.SeekStart)
   315  	for i < length {
   316  		// unpack groups of 32 bytes at a time into a buffer since it's more efficient
   317  		unpackSize := utils.MinInt(buflen, length-i)
   318  		numUnpacked := unpack32(b.reader, b.unpackBuf[:unpackSize], int(bits))
   319  		if numUnpacked == 0 {
   320  			break
   321  		}
   322  
   323  		for k := 0; k < numUnpacked; k++ {
   324  			out[i+k] = uint64(b.unpackBuf[k])
   325  		}
   326  		i += numUnpacked
   327  		b.byteoffset += int64(numUnpacked * int(bits) / 8)
   328  	}
   329  
   330  	b.fillbuffer()
   331  	// and then the remaining trailing values
   332  	for ; i < length; i++ {
   333  		val, err := b.next(bits)
   334  		out[i] = val
   335  		if err != nil {
   336  			return i, err
   337  		}
   338  	}
   339  
   340  	return i, nil
   341  }
   342  
   343  // GetValue returns a single value that is bit packed using width as the number of bits
   344  // and returns false if there weren't enough bits remaining.
   345  func (b *BitReader) GetValue(width int) (uint64, bool) {
   346  	v := make([]uint64, 1)
   347  	n, _ := b.GetBatch(uint(width), v)
   348  	return v[0], n == 1
   349  }