github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/column_readers.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"encoding/binary"
	"errors"
	"fmt"
	"reflect"
	"sync"
	"sync/atomic"
	"time"
	"unsafe"

	"github.com/apache/arrow/go/v14/arrow"
	"github.com/apache/arrow/go/v14/arrow/array"
	"github.com/apache/arrow/go/v14/arrow/bitutil"
	"github.com/apache/arrow/go/v14/arrow/decimal128"
	"github.com/apache/arrow/go/v14/arrow/decimal256"
	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/internal/utils"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/file"
	"github.com/apache/arrow/go/v14/parquet/schema"
	"golang.org/x/sync/errgroup"
)
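// The readers below are normally driven through the public pqarrow API
// rather than constructed directly. A minimal sketch (the file name is
// hypothetical) of reading a parquet file into an arrow.Table, which
// exercises these column readers internally:
//
//	rdr, err := file.OpenParquetFile("example.parquet", false)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer rdr.Close()
//	arrowRdr, err := pqarrow.NewFileReader(rdr,
//		pqarrow.ArrowReadProperties{Parallel: true}, memory.DefaultAllocator)
//	if err != nil {
//		log.Fatal(err)
//	}
//	tbl, err := arrowRdr.ReadTable(context.Background())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer tbl.Release()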

// column reader for leaf columns (non-nested)
type leafReader struct {
	out       *arrow.Chunked
	rctx      *readerCtx
	field     *arrow.Field
	input     *columnIterator
	descr     *schema.Column
	recordRdr file.RecordReader
	props     ArrowReadProperties

	refCount int64
}

func newLeafReader(rctx *readerCtx, field *arrow.Field, input *columnIterator, leafInfo file.LevelInfo, props ArrowReadProperties, bufferPool *sync.Pool) (*ColumnReader, error) {
	ret := &leafReader{
		rctx:      rctx,
		field:     field,
		input:     input,
		descr:     input.Descr(),
		recordRdr: file.NewRecordReader(input.Descr(), leafInfo, field.Type, rctx.mem, bufferPool),
		props:     props,
		refCount:  1,
	}
	err := ret.nextRowGroup()
	return &ColumnReader{ret}, err
}

func (lr *leafReader) Retain() {
	atomic.AddInt64(&lr.refCount, 1)
}

func (lr *leafReader) Release() {
	if atomic.AddInt64(&lr.refCount, -1) == 0 {
		lr.releaseOut()
		if lr.recordRdr != nil {
			lr.recordRdr.Release()
			lr.recordRdr = nil
		}
	}
}

func (lr *leafReader) GetDefLevels() ([]int16, error) {
	return lr.recordRdr.DefLevels()[:int(lr.recordRdr.LevelsPos())], nil
}

func (lr *leafReader) GetRepLevels() ([]int16, error) {
	return lr.recordRdr.RepLevels()[:int(lr.recordRdr.LevelsPos())], nil
}

func (lr *leafReader) IsOrHasRepeatedChild() bool { return false }

func (lr *leafReader) LoadBatch(nrecords int64) (err error) {
	lr.releaseOut()
	lr.recordRdr.Reset()

	if err := lr.recordRdr.Reserve(nrecords); err != nil {
		return err
	}
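	// read until nrecords records have been accumulated, advancing to the
	// next row group whenever the current page reader is exhausted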
	for nrecords > 0 {
		if !lr.recordRdr.HasMore() {
			break
		}
		numRead, err := lr.recordRdr.ReadRecords(nrecords)
		if err != nil {
			return err
		}
		nrecords -= numRead
		if numRead == 0 {
			if err = lr.nextRowGroup(); err != nil {
				return err
			}
		}
	}
	lr.out, err = transferColumnData(lr.recordRdr, lr.field.Type, lr.descr)
	return
}

func (lr *leafReader) BuildArray(int64) (*arrow.Chunked, error) {
	return lr.clearOut(), nil
}

// releaseOut clears lr.out and releases the old value if it was non-nil
func (lr *leafReader) releaseOut() {
	if out := lr.clearOut(); out != nil {
		out.Release()
	}
}

// clearOut clears lr.out and returns the old value
func (lr *leafReader) clearOut() (out *arrow.Chunked) {
	out, lr.out = lr.out, nil
	return out
}

func (lr *leafReader) Field() *arrow.Field { return lr.field }

func (lr *leafReader) nextRowGroup() error {
	pr, err := lr.input.NextChunk()
	if err != nil {
		return err
	}
	lr.recordRdr.SetPageReader(pr)
	return nil
}

// column reader for struct arrays; has a reader for each child, which may
// itself be a nested or a leaf column.
type structReader struct {
	rctx             *readerCtx
	filtered         *arrow.Field
	levelInfo        file.LevelInfo
	children         []*ColumnReader
	defRepLevelChild *ColumnReader
	hasRepeatedChild bool
	props            ArrowReadProperties

	refCount int64
}

func (sr *structReader) Retain() {
	atomic.AddInt64(&sr.refCount, 1)
}

func (sr *structReader) Release() {
	if atomic.AddInt64(&sr.refCount, -1) == 0 {
		if sr.defRepLevelChild != nil {
			sr.defRepLevelChild.Release()
			sr.defRepLevelChild = nil
		}
		for _, c := range sr.children {
			c.Release()
		}
		sr.children = nil
	}
}

func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.LevelInfo, children []*ColumnReader, props ArrowReadProperties) *ColumnReader {
	ret := &structReader{
		rctx:      rctx,
		filtered:  filtered,
		levelInfo: levelInfo,
		children:  children,
		props:     props,
		refCount:  1,
	}

	// there may be a mix of repeated and non-repeated children; if possible,
	// pick a non-repeated child since it is guaranteed to have the fewest
	// levels needed to reconstruct a nullable bitmap
	for _, child := range children {
		if !child.IsOrHasRepeatedChild() {
			ret.defRepLevelChild = child
			break
		}
	}

	if ret.defRepLevelChild == nil {
		ret.defRepLevelChild = children[0]
		ret.hasRepeatedChild = true
	}
	ret.defRepLevelChild.Retain()
	return &ColumnReader{ret}
}

func (sr *structReader) IsOrHasRepeatedChild() bool { return sr.hasRepeatedChild }

func (sr *structReader) GetDefLevels() ([]int16, error) {
	if len(sr.children) == 0 {
		return nil, errors.New("struct reader has no children")
	}

	// this method should only be called when this struct or one of its
	// parents is optional/repeated or has a repeated child, meaning all
	// children must have rep/def levels associated with them
	return sr.defRepLevelChild.GetDefLevels()
}

func (sr *structReader) GetRepLevels() ([]int16, error) {
	if len(sr.children) == 0 {
		return nil, errors.New("struct reader has no children")
	}

	// this method should only be called when this struct or one of its
	// parents is optional/repeated or has a repeated child, meaning all
	// children must have rep/def levels associated with them
	return sr.defRepLevelChild.GetRepLevels()
}

func (sr *structReader) LoadBatch(nrecords int64) error {
	// Load batches in parallel. When reading structs with a large number of
	// columns, a serial load is very slow, especially when reading from cloud
	// storage. Loading the children concurrently greatly improves performance.
	g := new(errgroup.Group)
	if !sr.props.Parallel {
		g.SetLimit(1)
	}
	for _, rdr := range sr.children {
		rdr := rdr
		g.Go(func() error {
			return rdr.LoadBatch(nrecords)
		})
	}

	return g.Wait()
}

func (sr *structReader) Field() *arrow.Field { return sr.filtered }

func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
	validityIO := file.ValidityBitmapInputOutput{
		ReadUpperBound: lenBound,
		Read:           lenBound,
	}

	var nullBitmap *memory.Buffer

	if lenBound > 0 && (sr.hasRepeatedChild || sr.filtered.Nullable) {
		nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
		nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
		defer nullBitmap.Release()
		validityIO.ValidBits = nullBitmap.Bytes()
		defLevels, err := sr.GetDefLevels()
		if err != nil {
			return nil, err
		}

		if sr.hasRepeatedChild {
			repLevels, err := sr.GetRepLevels()
			if err != nil {
				return nil, err
			}

			if err := file.DefRepLevelsToBitmap(defLevels, repLevels, sr.levelInfo, &validityIO); err != nil {
				return nil, err
			}
		} else {
			file.DefLevelsToBitmap(defLevels, sr.levelInfo, &validityIO)
		}
	}

	if nullBitmap != nil {
		nullBitmap.Resize(int(bitutil.BytesForBits(validityIO.Read)))
	}

	childArrData := make([]arrow.ArrayData, len(sr.children))
	defer releaseArrayData(childArrData)
	// gather children arrays and def levels
	for i, child := range sr.children {
		field, err := child.BuildArray(lenBound)
		if err != nil {
			return nil, err
		}

		childArrData[i], err = chunksToSingle(field)
		field.Release() // release field before checking
		if err != nil {
			return nil, err
		}
	}

	if !sr.filtered.Nullable && !sr.hasRepeatedChild {
		validityIO.Read = int64(childArrData[0].Len())
	}

	buffers := make([]*memory.Buffer, 1)
	if validityIO.NullCount > 0 {
		buffers[0] = nullBitmap
	}

	data := array.NewData(sr.filtered.Type, int(validityIO.Read), buffers, childArrData, int(validityIO.NullCount), 0)
	defer data.Release()
	arr := array.NewStructData(data)
	defer arr.Release()
	return arrow.NewChunked(sr.filtered.Type, []arrow.Array{arr}), nil
}

// column reader for repeated columns specifically for list arrays
type listReader struct {
	rctx     *readerCtx
	field    *arrow.Field
	info     file.LevelInfo
	itemRdr  *ColumnReader
	props    ArrowReadProperties
	refCount int64
}

func newListReader(rctx *readerCtx, field *arrow.Field, info file.LevelInfo, childRdr *ColumnReader, props ArrowReadProperties) *ColumnReader {
	childRdr.Retain()
	return &ColumnReader{&listReader{rctx, field, info, childRdr, props, 1}}
}

func (lr *listReader) Retain() {
	atomic.AddInt64(&lr.refCount, 1)
}

func (lr *listReader) Release() {
	if atomic.AddInt64(&lr.refCount, -1) == 0 {
		if lr.itemRdr != nil {
			lr.itemRdr.Release()
			lr.itemRdr = nil
		}
	}
}

func (lr *listReader) GetDefLevels() ([]int16, error) {
	return lr.itemRdr.GetDefLevels()
}

func (lr *listReader) GetRepLevels() ([]int16, error) {
	return lr.itemRdr.GetRepLevels()
}

func (lr *listReader) Field() *arrow.Field { return lr.field }

func (lr *listReader) IsOrHasRepeatedChild() bool { return true }

func (lr *listReader) LoadBatch(nrecords int64) error {
	return lr.itemRdr.LoadBatch(nrecords)
}

func (lr *listReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
	var (
		defLevels      []int16
		repLevels      []int16
		err            error
		validityBuffer *memory.Buffer
	)

	if defLevels, err = lr.itemRdr.GetDefLevels(); err != nil {
		return nil, err
	}
	if repLevels, err = lr.itemRdr.GetRepLevels(); err != nil {
		return nil, err
	}

	validityIO := file.ValidityBitmapInputOutput{ReadUpperBound: lenBound}
	if lr.field.Nullable {
		validityBuffer = memory.NewResizableBuffer(lr.rctx.mem)
		validityBuffer.Resize(int(bitutil.BytesForBits(lenBound)))
		defer validityBuffer.Release()
		validityIO.ValidBits = validityBuffer.Bytes()
	}
	offsetsBuffer := memory.NewResizableBuffer(lr.rctx.mem)
	offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(lenBound) + 1))
	defer offsetsBuffer.Release()

	offsetData := arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
	if err = file.DefRepLevelsToListInfo(defLevels, repLevels, lr.info, &validityIO, offsetData); err != nil {
		return nil, err
	}

	// If the child (itemRdr) has nulls and is itself a nested type such as
	// a list, its BuildArray needs to account for those nulls via the number
	// of definition levels when building out the bitmap. The worst-case upper
	// bound we must reserve space for is therefore the value of the last
	// offset plus the null count.
	arr, err := lr.itemRdr.BuildArray(int64(offsetData[int(validityIO.Read)]) + validityIO.NullCount)
	if err != nil {
		return nil, err
	}
	defer arr.Release()

	// resize to actual number of elems returned
	offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(validityIO.Read) + 1))
	if validityBuffer != nil {
		validityBuffer.Resize(int(bitutil.BytesForBits(validityIO.Read)))
	}

	item, err := chunksToSingle(arr)
	if err != nil {
		return nil, err
	}
	defer item.Release()

	buffers := []*memory.Buffer{nil, offsetsBuffer}
	if validityIO.NullCount > 0 {
		buffers[0] = validityBuffer
	}

	data := array.NewData(lr.field.Type, int(validityIO.Read), buffers, []arrow.ArrayData{item}, int(validityIO.NullCount), 0)
	defer data.Release()
	if lr.field.Type.ID() == arrow.FIXED_SIZE_LIST {
		defer data.Buffers()[1].Release()
		listSize := lr.field.Type.(*arrow.FixedSizeListType).Len()
		for x := 1; x < data.Len(); x++ {
			size := offsetData[x] - offsetData[x-1]
			if size != listSize {
				return nil, fmt.Errorf("expected all lists to be of size=%d, but index %d had size=%d", listSize, x, size)
			}
		}
		data.Buffers()[1] = nil
	}
	out := array.MakeFromData(data)
	defer out.Release()
	return arrow.NewChunked(lr.field.Type, []arrow.Array{out}), nil
}

// column reader logic for fixed size lists instead of variable length ones.
type fixedSizeListReader struct {
	listReader
}

func newFixedSizeListReader(rctx *readerCtx, field *arrow.Field, info file.LevelInfo, childRdr *ColumnReader, props ArrowReadProperties) *ColumnReader {
	childRdr.Retain()
	return &ColumnReader{&fixedSizeListReader{listReader{rctx, field, info, childRdr, props, 1}}}
}

// helper function to combine chunks into a single array.
//
// nested data conversion for chunked array outputs not yet implemented
func chunksToSingle(chunked *arrow.Chunked) (arrow.ArrayData, error) {
	switch len(chunked.Chunks()) {
	case 0:
		return array.NewData(chunked.DataType(), 0, []*memory.Buffer{nil, nil}, nil, 0, 0), nil
	case 1:
		data := chunked.Chunk(0).Data()
		data.Retain() // we pass control to the caller
		return data, nil
	default: // if an item reader yields a chunked array, this is not yet implemented
		return nil, arrow.ErrNotImplemented
	}
}

// create a chunked arrow array from the raw record data
func transferColumnData(rdr file.RecordReader, valueType arrow.DataType, descr *schema.Column) (*arrow.Chunked, error) {
	dt := valueType
	if valueType.ID() == arrow.EXTENSION {
		dt = valueType.(arrow.ExtensionType).StorageType()
	}

	var data arrow.ArrayData
	switch dt.ID() {
	case arrow.DICTIONARY:
		return transferDictionary(rdr, valueType), nil
	case arrow.NULL:
		return arrow.NewChunked(arrow.Null, []arrow.Array{array.NewNull(rdr.ValuesWritten())}), nil
	case arrow.INT32, arrow.INT64, arrow.FLOAT32, arrow.FLOAT64:
		data = transferZeroCopy(rdr, valueType) // can just reference the raw data without copying
	case arrow.BOOL:
		data = transferBool(rdr)
	case arrow.UINT8,
		arrow.UINT16,
		arrow.UINT32,
		arrow.UINT64,
		arrow.INT8,
		arrow.INT16,
		arrow.DATE32,
		arrow.TIME32,
		arrow.TIME64:
		data = transferInt(rdr, valueType)
	case arrow.DATE64:
		data = transferDate64(rdr, valueType)
	case arrow.FIXED_SIZE_BINARY, arrow.BINARY, arrow.STRING, arrow.LARGE_BINARY, arrow.LARGE_STRING:
		return transferBinary(rdr, valueType), nil
	case arrow.DECIMAL, arrow.DECIMAL256:
		switch descr.PhysicalType() {
		case parquet.Types.Int32, parquet.Types.Int64:
			data = transferDecimalInteger(rdr, valueType)
		case parquet.Types.ByteArray, parquet.Types.FixedLenByteArray:
			return transferDecimalBytes(rdr.(file.BinaryRecordReader), valueType)
		default:
			return nil, errors.New("physical type for decimal128/decimal256 must be int32, int64, bytearray or fixed len byte array")
		}
	case arrow.TIMESTAMP:
		tstype := valueType.(*arrow.TimestampType)
		switch tstype.Unit {
		case arrow.Millisecond, arrow.Microsecond:
			data = transferZeroCopy(rdr, valueType)
		case arrow.Nanosecond:
			if descr.PhysicalType() == parquet.Types.Int96 {
				data = transferInt96(rdr, valueType)
			} else {
				data = transferZeroCopy(rdr, valueType)
			}
		default:
			return nil, errors.New("time unit not supported")
		}
	default:
		return nil, fmt.Errorf("no support for reading columns of type: %s", valueType.Name())
	}

	defer data.Release()
	arr := array.MakeFromData(data)
	defer arr.Release()
	return arrow.NewChunked(valueType, []arrow.Array{arr}), nil
}

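// transferZeroCopy hands the record reader's validity and value buffers
// directly to Arrow without copying; it is only reached for types whose
// Arrow memory layout matches Parquet's plain encoding exactly (int32,
// int64, float32, float64 and most timestamps, per the dispatch above).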
func transferZeroCopy(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	bitmap := rdr.ReleaseValidBits()
	values := rdr.ReleaseValues()
	defer func() {
		if bitmap != nil {
			bitmap.Release()
		}
		if values != nil {
			values.Release()
		}
	}()

	return array.NewData(dt, rdr.ValuesWritten(),
		[]*memory.Buffer{bitmap, values},
		nil, int(rdr.NullCount()), 0)
}

func transferBinary(rdr file.RecordReader, dt arrow.DataType) *arrow.Chunked {
	brdr := rdr.(file.BinaryRecordReader)
	if brdr.ReadDictionary() {
		return transferDictionary(brdr, &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32, ValueType: dt})
	}
	chunks := brdr.GetBuilderChunks()
	defer releaseArrays(chunks)

	switch dt := dt.(type) {
	case arrow.ExtensionType:
		for idx, chunk := range chunks {
			chunks[idx] = array.NewExtensionArrayWithStorage(dt, chunk)
			chunk.Release()
		}
	case *arrow.StringType, *arrow.LargeStringType:
		for idx, chunk := range chunks {
			chunks[idx] = array.MakeFromData(chunk.Data())
			chunk.Release()
		}
	}
	return arrow.NewChunked(dt, chunks)
}

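// transferInt copies the physical int32/int64 values element-wise into a
// buffer of the requested Arrow type (e.g. a Parquet int32 column annotated
// as uint8 lands in a 1-byte-per-value buffer), using reflection to avoid
// writing a separate loop for every source/destination type pair.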
func transferInt(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	var output reflect.Value

	signed := true
	// create a buffer of the proper type: parquet only has int32 and int64
	// physical representations, but we want the correct type representation
	// for Arrow's in-memory buffer.
	data := make([]byte, rdr.ValuesWritten()*int(bitutil.BytesForBits(int64(dt.(arrow.FixedWidthDataType).BitWidth()))))
	switch dt.ID() {
	case arrow.INT8:
		output = reflect.ValueOf(arrow.Int8Traits.CastFromBytes(data))
	case arrow.UINT8:
		signed = false
		output = reflect.ValueOf(arrow.Uint8Traits.CastFromBytes(data))
	case arrow.INT16:
		output = reflect.ValueOf(arrow.Int16Traits.CastFromBytes(data))
	case arrow.UINT16:
		signed = false
		output = reflect.ValueOf(arrow.Uint16Traits.CastFromBytes(data))
	case arrow.UINT32:
		signed = false
		output = reflect.ValueOf(arrow.Uint32Traits.CastFromBytes(data))
	case arrow.UINT64:
		signed = false
		output = reflect.ValueOf(arrow.Uint64Traits.CastFromBytes(data))
	case arrow.DATE32:
		output = reflect.ValueOf(arrow.Date32Traits.CastFromBytes(data))
	case arrow.TIME32:
		output = reflect.ValueOf(arrow.Time32Traits.CastFromBytes(data))
	case arrow.TIME64:
		output = reflect.ValueOf(arrow.Time64Traits.CastFromBytes(data))
	}

	length := rdr.ValuesWritten()
	// copy the values semantically with the correct types
	switch rdr.Type() {
	case parquet.Types.Int32:
		values := arrow.Int32Traits.CastFromBytes(rdr.Values())
		if signed {
			for idx, v := range values[:length] {
				output.Index(idx).SetInt(int64(v))
			}
		} else {
			for idx, v := range values[:length] {
				output.Index(idx).SetUint(uint64(v))
			}
		}
	case parquet.Types.Int64:
		values := arrow.Int64Traits.CastFromBytes(rdr.Values())
		if signed {
			for idx, v := range values[:length] {
				output.Index(idx).SetInt(v)
			}
		} else {
			for idx, v := range values[:length] {
				output.Index(idx).SetUint(uint64(v))
			}
		}
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}

	return array.NewData(dt, rdr.ValuesWritten(), []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

func transferBool(rdr file.RecordReader) arrow.ArrayData {
	// TODO(mtopol): optimize this so we don't convert bitmap to []bool back to bitmap
	length := rdr.ValuesWritten()
	data := make([]byte, int(bitutil.BytesForBits(int64(length))))
	bytedata := rdr.Values()
	values := *(*[]bool)(unsafe.Pointer(&bytedata))

	for idx, v := range values[:length] {
		if v {
			bitutil.SetBit(data, idx)
		}
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	bb := memory.NewBufferBytes(data)
	defer bb.Release()
	return array.NewData(&arrow.BooleanType{}, length, []*memory.Buffer{
		bitmap, bb,
	}, nil, int(rdr.NullCount()), 0)
}

var milliPerDay = (24 * time.Hour).Milliseconds()

// the Parquet equivalent for Date64 is a 32-bit integer counting days
// since the epoch; convert each value to milliseconds for Date64
func transferDate64(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()
	values := arrow.Int32Traits.CastFromBytes(rdr.Values())

	data := make([]byte, arrow.Int64Traits.BytesRequired(length))
	out := arrow.Int64Traits.CastFromBytes(data)
	for idx, val := range values[:length] {
		out[idx] = int64(val) * milliPerDay
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

// coerce int96 to nanosecond timestamp
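// An INT96 timestamp stores the nanoseconds within the day in its first 8
// bytes and the Julian day in its last 4 bytes (both little-endian); a
// Julian day of 0 is mapped to a zero timestamp below.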
func transferInt96(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()
	values := parquet.Int96Traits.CastFromBytes(rdr.Values())

	data := make([]byte, arrow.Int64SizeBytes*length)
	out := arrow.Int64Traits.CastFromBytes(data)

	for idx, val := range values[:length] {
		if binary.LittleEndian.Uint32(val[8:]) == 0 {
			out[idx] = 0
		} else {
			out[idx] = val.ToTime().UnixNano()
		}
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

// convert physical integer storage of a decimal logical type to a decimal128 or decimal256 typed array
func transferDecimalInteger(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()

	var values reflect.Value
	switch rdr.Type() {
	case parquet.Types.Int32:
		values = reflect.ValueOf(arrow.Int32Traits.CastFromBytes(rdr.Values())[:length])
	case parquet.Types.Int64:
		values = reflect.ValueOf(arrow.Int64Traits.CastFromBytes(rdr.Values())[:length])
	}

	var data []byte
	switch dt.ID() {
	case arrow.DECIMAL128:
		data = make([]byte, arrow.Decimal128Traits.BytesRequired(length))
		out := arrow.Decimal128Traits.CastFromBytes(data)
		for i := 0; i < values.Len(); i++ {
			out[i] = decimal128.FromI64(values.Index(i).Int())
		}
	case arrow.DECIMAL256:
		data = make([]byte, arrow.Decimal256Traits.BytesRequired(length))
		out := arrow.Decimal256Traits.CastFromBytes(data)
		for i := 0; i < values.Len(); i++ {
			out[i] = decimal256.FromI64(values.Index(i).Int())
		}
	}

	var nullmap *memory.Buffer
	if rdr.NullCount() > 0 {
		nullmap = rdr.ReleaseValidBits()
		defer nullmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		nullmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

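// uint64FromBigEndianShifted interprets up to 8 big-endian bytes as the low
// bytes of a uint64; e.g. buf = []byte{0x12, 0x34} yields 0x1234.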
func uint64FromBigEndianShifted(buf []byte) uint64 {
	var bytes [8]byte
	copy(bytes[8-len(buf):], buf)
	return binary.BigEndian.Uint64(bytes[:])
}

// parquet's defined encoding for decimal data is big-endian bytes, so
// convert the big-endian byte order to a decimal128
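//
// For example, the two-byte buffer {0xFF, 0x9C} is the big-endian
// two's-complement encoding of -100: hi is sign-filled to -1 and lo becomes
// 0xFFFFFFFFFFFFFF9C, so the result is decimal128.New(-1, 0xFFFFFFFFFFFFFF9C).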
func bigEndianToDecimal128(buf []byte) (decimal128.Num, error) {
	const (
		minDecimalBytes = 1
		maxDecimalBytes = 16
	)

	if len(buf) < minDecimalBytes || len(buf) > maxDecimalBytes {
		return decimal128.Num{}, fmt.Errorf("length of byte array passed to bigEndianToDecimal128 was %d but must be between %d and %d",
			len(buf), minDecimalBytes, maxDecimalBytes)
	}

	// bytes are big endian so first byte is MSB and holds the sign bit
	isNeg := int8(buf[0]) < 0

	// 1. extract high bits
	highBitsOffset := utils.MaxInt(0, len(buf)-8)
	var (
		highBits uint64
		lowBits  uint64
		hi       int64
		lo       int64
	)
	highBits = uint64FromBigEndianShifted(buf[:highBitsOffset])

	if highBitsOffset == 8 {
		hi = int64(highBits)
	} else {
		if isNeg && len(buf) < maxDecimalBytes {
			hi = -1
		}

		hi = int64(uint64(hi) << (uint64(highBitsOffset) * 8))
		hi |= int64(highBits)
	}

	// 2. extract lower bits
	lowBitsOffset := utils.MinInt(len(buf), 8)
	lowBits = uint64FromBigEndianShifted(buf[highBitsOffset:])

	if lowBitsOffset == 8 {
		lo = int64(lowBits)
	} else {
		if isNeg && len(buf) < 8 {
			lo = -1
		}

		lo = int64(uint64(lo) << (uint64(lowBitsOffset) * 8))
		lo |= int64(lowBits)
	}

	return decimal128.New(hi, uint64(lo)), nil
}

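// bigEndianToDecimal256 sign-extends and reassembles up to 32 big-endian
// bytes into the four little-endian uint64 words of a decimal256, least
// significant word first; e.g. the one-byte buffer {0xFF} (-1) produces
// four all-ones words.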
func bigEndianToDecimal256(buf []byte) (decimal256.Num, error) {
	const (
		minDecimalBytes = 1
		maxDecimalBytes = 32
	)

	if len(buf) < minDecimalBytes || len(buf) > maxDecimalBytes {
		return decimal256.Num{},
			fmt.Errorf("%w: length of byte array for bigEndianToDecimal256 was %d but must be between %d and %d",
				arrow.ErrInvalid, len(buf), minDecimalBytes, maxDecimalBytes)
	}

	var littleEndian [4]uint64
	// bytes are coming in big-endian, so the first byte is the MSB and
	// therefore holds the sign bit
	initWord, isNeg := uint64(0), int8(buf[0]) < 0
	if isNeg {
		// sign extend if necessary
		initWord = uint64(0xFFFFFFFFFFFFFFFF)
	}

	for wordIdx := 0; wordIdx < 4; wordIdx++ {
		wordLen := utils.MinInt(len(buf), arrow.Uint64SizeBytes)
		word := buf[len(buf)-wordLen:]

		if wordLen == 8 {
			// full words can be assigned as-is
			littleEndian[wordIdx] = binary.BigEndian.Uint64(word)
		} else {
			result := initWord
			if len(buf) > 0 {
				// incorporate the actual values if present:
				// shift left enough bits (8 per remaining byte) to make room
				// for the incoming bytes, then preserve the upper bits by
				// OR-ing in the loaded value
				result = result << uint64(wordLen*8)
				result |= uint64FromBigEndianShifted(word)
			}
			littleEndian[wordIdx] = result
		}

		buf = buf[:len(buf)-wordLen]
	}

	return decimal256.New(littleEndian[3], littleEndian[2], littleEndian[1], littleEndian[0]), nil
}

type varOrFixedBin interface {
	arrow.Array
	Value(i int) []byte
}

// convert physical byte storage, instead of integers, to decimal128 or decimal256
func transferDecimalBytes(rdr file.BinaryRecordReader, dt arrow.DataType) (*arrow.Chunked, error) {
	convert128 := func(in varOrFixedBin) (arrow.Array, error) {
		length := in.Len()
		data := make([]byte, arrow.Decimal128Traits.BytesRequired(length))
		out := arrow.Decimal128Traits.CastFromBytes(data)

		nullCount := in.NullN()
		var err error
		for i := 0; i < length; i++ {
			if nullCount > 0 && in.IsNull(i) {
				continue
			}

			rec := in.Value(i)
			if len(rec) <= 0 {
				return nil, fmt.Errorf("invalid BYTEARRAY length for type: %s", dt)
			}
			out[i], err = bigEndianToDecimal128(rec)
			if err != nil {
				return nil, err
			}
		}

		ret := array.NewData(dt, length, []*memory.Buffer{
			in.Data().Buffers()[0], memory.NewBufferBytes(data),
		}, nil, nullCount, 0)
		defer ret.Release()
		return array.MakeFromData(ret), nil
	}

	convert256 := func(in varOrFixedBin) (arrow.Array, error) {
		length := in.Len()
		data := make([]byte, arrow.Decimal256Traits.BytesRequired(length))
		out := arrow.Decimal256Traits.CastFromBytes(data)

		nullCount := in.NullN()
		var err error
		for i := 0; i < length; i++ {
			if nullCount > 0 && in.IsNull(i) {
				continue
			}

			rec := in.Value(i)
			if len(rec) <= 0 {
				return nil, fmt.Errorf("invalid BYTEARRAY length for type: %s", dt)
			}
			out[i], err = bigEndianToDecimal256(rec)
			if err != nil {
				return nil, err
			}
		}

		ret := array.NewData(dt, length, []*memory.Buffer{
			in.Data().Buffers()[0], memory.NewBufferBytes(data),
		}, nil, nullCount, 0)
		defer ret.Release()
		return array.MakeFromData(ret), nil
	}

	convert := func(arr arrow.Array) (arrow.Array, error) {
		switch dt.ID() {
		case arrow.DECIMAL128:
			return convert128(arr.(varOrFixedBin))
		case arrow.DECIMAL256:
			return convert256(arr.(varOrFixedBin))
		}
		return nil, arrow.ErrNotImplemented
	}

	chunks := rdr.GetBuilderChunks()
	var err error
	for idx, chunk := range chunks {
		defer chunk.Release()
		if chunks[idx], err = convert(chunk); err != nil {
			return nil, err
		}
		defer chunks[idx].Release()
	}
	return arrow.NewChunked(dt, chunks), nil
}

func transferDictionary(rdr file.RecordReader, logicalValueType arrow.DataType) *arrow.Chunked {
	brdr := rdr.(file.BinaryRecordReader)
	chunks := brdr.GetBuilderChunks()
	defer releaseArrays(chunks)
	return arrow.NewChunked(logicalValueType, chunks)
}