github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/column_readers.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow
    18  
    19  import (
    20  	"encoding/binary"
    21  	"reflect"
    22  	"sync/atomic"
    23  	"time"
    24  	"unsafe"
    25  
    26  	"github.com/apache/arrow/go/v7/arrow"
    27  	"github.com/apache/arrow/go/v7/arrow/array"
    28  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    29  	"github.com/apache/arrow/go/v7/arrow/decimal128"
    30  	"github.com/apache/arrow/go/v7/arrow/memory"
    31  	"github.com/apache/arrow/go/v7/parquet"
    32  	"github.com/apache/arrow/go/v7/parquet/file"
    33  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    34  	"github.com/apache/arrow/go/v7/parquet/schema"
    35  	"golang.org/x/xerrors"
    36  )
    37  
// leafReader is the column reader for leaf (non-nested) columns. It wraps a
// file.RecordReader and surfaces each loaded batch as an arrow.Chunked.
type leafReader struct {
	out       *arrow.Chunked    // result of the most recent LoadBatch call
	rctx      *readerCtx        // shared reader context (allocator, etc.)
	field     *arrow.Field      // arrow field this column maps to
	input     *columnIterator   // iterator over the column's row-group chunks
	descr     *schema.Column    // parquet column descriptor
	recordRdr file.RecordReader // underlying record reader for this column

	refCount int64 // atomic reference count; resources freed when it hits zero
}
    49  
    50  func newLeafReader(rctx *readerCtx, field *arrow.Field, input *columnIterator, leafInfo file.LevelInfo) (*ColumnReader, error) {
    51  	ret := &leafReader{
    52  		rctx:      rctx,
    53  		field:     field,
    54  		input:     input,
    55  		descr:     input.Descr(),
    56  		recordRdr: file.NewRecordReader(input.Descr(), leafInfo, field.Type.ID() == arrow.DICTIONARY, rctx.mem),
    57  		refCount:  1,
    58  	}
    59  	err := ret.nextRowGroup()
    60  	return &ColumnReader{ret}, err
    61  }
    62  
// Retain atomically increments the reader's reference count.
func (lr *leafReader) Retain() {
	atomic.AddInt64(&lr.refCount, 1)
}
    66  
// Release atomically decrements the reference count and, on reaching zero,
// releases the held output chunked array and the record reader.
func (lr *leafReader) Release() {
	if atomic.AddInt64(&lr.refCount, -1) == 0 {
		if lr.out != nil {
			lr.out.Release()
			lr.out = nil
		}
		if lr.recordRdr != nil {
			lr.recordRdr.Release()
			lr.recordRdr = nil
		}
	}
}
    79  
// GetDefLevels returns the definition levels read so far, trimmed to the
// current levels position.
func (lr *leafReader) GetDefLevels() ([]int16, error) {
	return lr.recordRdr.DefLevels()[:int(lr.recordRdr.LevelsPos())], nil
}
    83  
// GetRepLevels returns the repetition levels read so far, trimmed to the
// current levels position.
func (lr *leafReader) GetRepLevels() ([]int16, error) {
	return lr.recordRdr.RepLevels()[:int(lr.recordRdr.LevelsPos())], nil
}
    87  
    88  func (lr *leafReader) IsOrHasRepeatedChild() bool { return false }
    89  
// LoadBatch reads up to nrecords records from the current (and any
// subsequent) row groups into lr.out as a chunked array, releasing any
// previously loaded batch first.
func (lr *leafReader) LoadBatch(nrecords int64) (err error) {
	if lr.out != nil {
		lr.out.Release()
		lr.out = nil
	}
	lr.recordRdr.Reset()

	if err := lr.recordRdr.Reserve(nrecords); err != nil {
		return err
	}
	for nrecords > 0 {
		if !lr.recordRdr.HasMore() {
			break
		}
		numRead, err := lr.recordRdr.ReadRecords(nrecords)
		if err != nil {
			return err
		}
		nrecords -= numRead
		if numRead == 0 {
			// current row group exhausted; advance to the next one
			if err = lr.nextRowGroup(); err != nil {
				return err
			}
		}
	}
	// named return: err is set here if the transfer fails
	lr.out, err = transferColumnData(lr.recordRdr, lr.field.Type, lr.descr, lr.rctx.mem)
	return
}
   118  
// BuildArray returns the chunked array produced by the last LoadBatch call;
// the length bound is ignored for leaf columns.
func (lr *leafReader) BuildArray(_ int64) (*arrow.Chunked, error) {
	return lr.out, nil
}
   122  
   123  func (lr *leafReader) Field() *arrow.Field { return lr.field }
   124  
// nextRowGroup advances the record reader to the page reader of the next
// row-group chunk of this column.
func (lr *leafReader) nextRowGroup() error {
	pr, err := lr.input.NextChunk()
	if err != nil {
		return err
	}
	lr.recordRdr.SetPageReader(pr)
	return nil
}
   133  
// structReader is the column reader for struct arrays; it has readers for
// each child which could themselves be nested or leaf columns.
type structReader struct {
	rctx             *readerCtx     // shared reader context (allocator, etc.)
	filtered         *arrow.Field   // the (possibly column-filtered) struct field
	levelInfo        file.LevelInfo // def/rep level info for this struct level
	children         []*ColumnReader
	defRepLevelChild *ColumnReader // child used as the source of def/rep levels
	hasRepeatedChild bool          // true if every child is or contains a repeated column

	refCount int64 // atomic reference count; children released at zero
}
   146  
// Retain atomically increments the reader's reference count.
func (sr *structReader) Retain() {
	atomic.AddInt64(&sr.refCount, 1)
}
   150  
// Release atomically decrements the reference count and, on reaching zero,
// releases the retained def/rep-level child and all child readers.
func (sr *structReader) Release() {
	if atomic.AddInt64(&sr.refCount, -1) == 0 {
		if sr.defRepLevelChild != nil {
			sr.defRepLevelChild.Release()
			sr.defRepLevelChild = nil
		}
		for _, c := range sr.children {
			c.Release()
		}
		sr.children = nil
	}
}
   163  
// newStructReader creates a reader for struct columns. One child is retained
// as the source of def/rep levels used to reconstruct the struct's own
// validity bitmap.
func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.LevelInfo, children []*ColumnReader) *ColumnReader {
	// there could be a mix of children some might be repeated and some might not be
	// if possible use one that isn't since that will be guaranteed to have the least
	// number of levels to reconstruct a nullable bitmap
	var result *ColumnReader
	for _, child := range children {
		if !child.IsOrHasRepeatedChild() {
			result = child
		}
	}

	ret := &structReader{
		rctx:      rctx,
		filtered:  filtered,
		levelInfo: levelInfo,
		children:  children,
		refCount:  1,
	}
	if result != nil {
		ret.defRepLevelChild = result
		ret.hasRepeatedChild = false
	} else {
		// every child is (or contains) a repeated column; fall back to the first
		ret.defRepLevelChild = children[0]
		ret.hasRepeatedChild = true
	}
	ret.defRepLevelChild.Retain()
	return &ColumnReader{ret}
}
   192  
   193  func (sr *structReader) IsOrHasRepeatedChild() bool { return sr.hasRepeatedChild }
   194  
   195  func (sr *structReader) GetDefLevels() ([]int16, error) {
   196  	if len(sr.children) == 0 {
   197  		return nil, xerrors.New("struct raeder has no children")
   198  	}
   199  
   200  	// this method should only be called when this struct or one of its parents
   201  	// are optional/repeated or has a repeated child
   202  	// meaning all children must have rep/def levels associated with them
   203  	return sr.defRepLevelChild.GetDefLevels()
   204  }
   205  
   206  func (sr *structReader) GetRepLevels() ([]int16, error) {
   207  	if len(sr.children) == 0 {
   208  		return nil, xerrors.New("struct raeder has no children")
   209  	}
   210  
   211  	// this method should only be called when this struct or one of its parents
   212  	// are optional/repeated or has a repeated child
   213  	// meaning all children must have rep/def levels associated with them
   214  	return sr.defRepLevelChild.GetRepLevels()
   215  }
   216  
   217  func (sr *structReader) LoadBatch(nrecords int64) error {
   218  	for _, rdr := range sr.children {
   219  		if err := rdr.LoadBatch(nrecords); err != nil {
   220  			return err
   221  		}
   222  	}
   223  	return nil
   224  }
   225  
   226  func (sr *structReader) Field() *arrow.Field { return sr.filtered }
   227  
// BuildArray assembles the struct array: it derives the struct-level
// validity bitmap (when needed) from def/rep levels, builds each child
// array, and combines them into a single-chunk chunked array.
// lenBound is an upper bound on the number of records to produce.
func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
	validityIO := file.ValidityBitmapInputOutput{
		ReadUpperBound: lenBound,
		Read:           lenBound,
	}

	var nullBitmap *memory.Buffer

	if sr.hasRepeatedChild {
		// repeated children: both def and rep levels are required to
		// reconstruct this struct's validity bitmap
		nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
		nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
		validityIO.ValidBits = nullBitmap.Bytes()
		defLevels, err := sr.GetDefLevels()
		if err != nil {
			return nil, err
		}
		repLevels, err := sr.GetRepLevels()
		if err != nil {
			return nil, err
		}

		if err := file.DefRepLevelsToBitmap(defLevels, repLevels, sr.levelInfo, &validityIO); err != nil {
			return nil, err
		}

	} else if sr.filtered.Nullable {
		// no repetition anywhere below: definition levels alone determine nulls
		nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
		nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
		validityIO.ValidBits = nullBitmap.Bytes()
		defLevels, err := sr.GetDefLevels()
		if err != nil {
			return nil, err
		}

		file.DefLevelsToBitmap(defLevels, sr.levelInfo, &validityIO)
	}

	// shrink the bitmap to the number of records actually read
	if nullBitmap != nil {
		nullBitmap.Resize(int(bitutil.BytesForBits(validityIO.Read)))
	}

	childArrData := make([]arrow.ArrayData, 0)
	// gather children arrays and def levels
	for _, child := range sr.children {
		field, err := child.BuildArray(validityIO.Read)
		if err != nil {
			return nil, err
		}
		arrdata, err := chunksToSingle(field)
		if err != nil {
			return nil, err
		}
		childArrData = append(childArrData, arrdata)
	}

	if !sr.filtered.Nullable && !sr.hasRepeatedChild {
		// non-nullable flat struct: take the length from the first child
		validityIO.Read = int64(childArrData[0].Len())
	}

	buffers := make([]*memory.Buffer, 1)
	if validityIO.NullCount > 0 {
		buffers[0] = nullBitmap
	}

	data := array.NewData(sr.filtered.Type, int(validityIO.Read), buffers, childArrData, int(validityIO.NullCount), 0)
	defer data.Release()
	arr := array.MakeFromData(data)
	defer arr.Release()
	return arrow.NewChunked(sr.filtered.Type, []arrow.Array{arr}), nil
}
   298  
// listReader is the column reader for repeated columns, specifically for
// list arrays; the actual item values come from the wrapped child reader.
type listReader struct {
	rctx    *readerCtx     // shared reader context (allocator, etc.)
	field   *arrow.Field   // the list field being produced
	info    file.LevelInfo // def/rep level info for this list level
	itemRdr *ColumnReader  // reader for the list's item column

	refCount int64 // atomic reference count; item reader released at zero
}
   308  
   309  func newListReader(rctx *readerCtx, field *arrow.Field, info file.LevelInfo, childRdr *ColumnReader) *ColumnReader {
   310  	childRdr.Retain()
   311  	return &ColumnReader{&listReader{rctx, field, info, childRdr, 1}}
   312  }
   313  
// Retain atomically increments the reader's reference count.
func (lr *listReader) Retain() {
	atomic.AddInt64(&lr.refCount, 1)
}
   317  
// Release atomically decrements the reference count and, on reaching zero,
// releases the retained item reader.
func (lr *listReader) Release() {
	if atomic.AddInt64(&lr.refCount, -1) == 0 {
		if lr.itemRdr != nil {
			lr.itemRdr.Release()
			lr.itemRdr = nil
		}
	}
}
   326  
// GetDefLevels returns the definition levels of the item reader.
func (lr *listReader) GetDefLevels() ([]int16, error) {
	return lr.itemRdr.GetDefLevels()
}
   330  
// GetRepLevels returns the repetition levels of the item reader.
func (lr *listReader) GetRepLevels() ([]int16, error) {
	return lr.itemRdr.GetRepLevels()
}
   334  
   335  func (lr *listReader) Field() *arrow.Field { return lr.field }
   336  
   337  func (lr *listReader) IsOrHasRepeatedChild() bool { return true }
   338  
// LoadBatch delegates batch loading to the item reader.
func (lr *listReader) LoadBatch(nrecords int64) error {
	return lr.itemRdr.LoadBatch(nrecords)
}
   342  
// BuildArray reconstructs the list array: it converts def/rep levels into
// list offsets and a validity bitmap, builds the child item array for the
// required number of items, and wraps everything into a single-chunk
// chunked array. lenBound is an upper bound on the number of lists produced.
func (lr *listReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
	var (
		defLevels      []int16
		repLevels      []int16
		err            error
		validityBuffer *memory.Buffer
	)

	if defLevels, err = lr.itemRdr.GetDefLevels(); err != nil {
		return nil, err
	}
	if repLevels, err = lr.itemRdr.GetRepLevels(); err != nil {
		return nil, err
	}

	validityIO := file.ValidityBitmapInputOutput{ReadUpperBound: lenBound}
	if lr.field.Nullable {
		validityBuffer = memory.NewResizableBuffer(lr.rctx.mem)
		validityBuffer.Resize(int(bitutil.BytesForBits(lenBound)))
		defer validityBuffer.Release()
		validityIO.ValidBits = validityBuffer.Bytes()
	}
	// one extra offset so offsets[i+1]-offsets[i] gives the length of list i
	offsetsBuffer := memory.NewResizableBuffer(lr.rctx.mem)
	offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(lenBound) + 1))
	defer offsetsBuffer.Release()

	offsetData := arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
	if err = file.DefRepLevelsToListInfo(defLevels, repLevels, lr.info, &validityIO, offsetData); err != nil {
		return nil, err
	}

	// the final offset is the total number of child items to build
	arr, err := lr.itemRdr.BuildArray(int64(offsetData[int(validityIO.Read)]))
	if err != nil {
		return nil, err
	}

	// resize to actual number of elems returned
	offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(validityIO.Read) + 1))
	if validityBuffer != nil {
		validityBuffer.Resize(int(bitutil.BytesForBits(validityIO.Read)))
	}

	item, err := chunksToSingle(arr)
	if err != nil {
		return nil, err
	}
	defer item.Release()

	buffers := []*memory.Buffer{nil, offsetsBuffer}
	if validityIO.NullCount > 0 {
		buffers[0] = validityBuffer
	}

	data := array.NewData(lr.field.Type, int(validityIO.Read), buffers, []arrow.ArrayData{item}, int(validityIO.NullCount), 0)
	defer data.Release()
	if lr.field.Type.ID() == arrow.FIXED_SIZE_LIST {
		// fixed-size lists carry no offsets buffer: verify every list has the
		// declared size, then drop the offsets buffer from the data (it is
		// released via the defer above)
		defer data.Buffers()[1].Release()
		listSize := lr.field.Type.(*arrow.FixedSizeListType).Len()
		for x := 1; x < data.Len(); x++ {
			size := offsetData[x] - offsetData[x-1]
			if size != listSize {
				return nil, xerrors.Errorf("expected all lists to be of size=%d, but index %d had size=%d", listSize, x, size)
			}
		}
		data.Buffers()[1] = nil
	}
	out := array.MakeFromData(data)
	defer out.Release()
	return arrow.NewChunked(lr.field.Type, []arrow.Array{out}), nil
}
   413  
// fixedSizeListReader reuses the listReader logic for fixed-size lists
// instead of variable-length ones; the size check happens in BuildArray.
type fixedSizeListReader struct {
	listReader
}
   418  
   419  func newFixedSizeListReader(rctx *readerCtx, field *arrow.Field, info file.LevelInfo, childRdr *ColumnReader) *ColumnReader {
   420  	childRdr.Retain()
   421  	return &ColumnReader{&fixedSizeListReader{listReader{rctx, field, info, childRdr, 1}}}
   422  }
   423  
   424  // helper function to combine chunks into a single array.
   425  //
   426  // nested data conversion for chunked array outputs not yet implemented
   427  func chunksToSingle(chunked *arrow.Chunked) (arrow.ArrayData, error) {
   428  	switch len(chunked.Chunks()) {
   429  	case 0:
   430  		return array.NewData(chunked.DataType(), 0, []*memory.Buffer{nil, nil}, nil, 0, 0), nil
   431  	case 1:
   432  		return chunked.Chunk(0).Data(), nil
   433  	default: // if an item reader yields a chunked array, this is not yet implemented
   434  		return nil, xerrors.New("not implemented")
   435  	}
   436  }
   437  
// transferColumnData creates a chunked arrow array from the raw record
// reader data, dispatching on the target arrow type to the appropriate
// transfer routine (zero-copy where the physical layout already matches).
func transferColumnData(rdr file.RecordReader, valueType arrow.DataType, descr *schema.Column, mem memory.Allocator) (*arrow.Chunked, error) {
	var data arrow.ArrayData
	switch valueType.ID() {
	// case arrow.DICTIONARY:
	case arrow.NULL:
		return arrow.NewChunked(arrow.Null, []arrow.Array{array.NewNull(rdr.ValuesWritten())}), nil
	case arrow.INT32, arrow.INT64, arrow.FLOAT32, arrow.FLOAT64:
		data = transferZeroCopy(rdr, valueType) // can just reference the raw data without copying
	case arrow.BOOL:
		data = transferBool(rdr)
	case arrow.UINT8,
		arrow.UINT16,
		arrow.UINT32,
		arrow.UINT64,
		arrow.INT8,
		arrow.INT16,
		arrow.DATE32,
		arrow.TIME32,
		arrow.TIME64:
		// physical storage is int32/int64; copy into the narrower/unsigned type
		data = transferInt(rdr, valueType)
	case arrow.DATE64:
		data = transferDate64(rdr, valueType)
	case arrow.FIXED_SIZE_BINARY, arrow.BINARY, arrow.STRING:
		return transferBinary(rdr, valueType), nil
	case arrow.DECIMAL:
		// decimal conversion depends on the parquet physical representation
		switch descr.PhysicalType() {
		case parquet.Types.Int32, parquet.Types.Int64:
			data = transferDecimalInteger(rdr, valueType)
		case parquet.Types.ByteArray, parquet.Types.FixedLenByteArray:
			return transferDecimalBytes(rdr.(file.BinaryRecordReader), valueType)
		default:
			return nil, xerrors.New("physical type for decimal128 must be int32, int64, bytearray or fixed len byte array")
		}
	case arrow.TIMESTAMP:
		tstype := valueType.(*arrow.TimestampType)
		switch tstype.Unit {
		case arrow.Millisecond, arrow.Microsecond:
			data = transferZeroCopy(rdr, valueType)
		case arrow.Nanosecond:
			// int96 timestamps need conversion; int64 nanos are zero-copy
			if descr.PhysicalType() == parquet.Types.Int96 {
				data = transferInt96(rdr, valueType)
			} else {
				data = transferZeroCopy(rdr, valueType)
			}
		default:
			return nil, xerrors.New("time unit not supported")
		}
	default:
		return nil, xerrors.Errorf("no support for reading columns of type: %s", valueType.Name())
	}

	defer data.Release()
	arr := array.MakeFromData(data)
	defer arr.Release()
	return arrow.NewChunked(valueType, []arrow.Array{arr}), nil
}
   495  
   496  func transferZeroCopy(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
   497  	bitmap := rdr.ReleaseValidBits()
   498  	values := rdr.ReleaseValues()
   499  	defer func() {
   500  		if bitmap != nil {
   501  			bitmap.Release()
   502  		}
   503  		if values != nil {
   504  			values.Release()
   505  		}
   506  	}()
   507  
   508  	return array.NewData(dt, rdr.ValuesWritten(), []*memory.Buffer{
   509  		bitmap, values}, nil, int(rdr.NullCount()), 0)
   510  }
   511  
// transferBinary wraps the builder chunks of a binary record reader into a
// chunked array; for string output the binary chunks are reinterpreted as
// string arrays without copying the underlying data.
func transferBinary(rdr file.RecordReader, dt arrow.DataType) *arrow.Chunked {
	brdr := rdr.(file.BinaryRecordReader)
	chunks := brdr.GetBuilderChunks()
	if dt == arrow.BinaryTypes.String {
		// convert chunks from binary to string without copying data,
		// just changing the interpretation of the metadata
		for idx := range chunks {
			chunks[idx] = array.MakeFromData(chunks[idx].Data())
			// deferred releases fire after NewChunked has retained the chunks
			defer chunks[idx].Data().Release()
			defer chunks[idx].Release()
		}
	}
	return arrow.NewChunked(dt, chunks)
}
   526  
// transferInt copies values out of the parquet physical representation
// (int32 or int64) into the requested narrower/unsigned/temporal arrow type,
// using reflection to write through the correctly-typed destination slice.
func transferInt(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	var (
		output reflect.Value
	)

	signed := true
	// create buffer for proper type since parquet only has int32 and int64
	// physical representations, but we want the correct type representation
	// for Arrow's in memory buffer.
	data := make([]byte, rdr.ValuesWritten()*int(bitutil.BytesForBits(int64(dt.(arrow.FixedWidthDataType).BitWidth()))))
	switch dt.ID() {
	case arrow.INT8:
		output = reflect.ValueOf(arrow.Int8Traits.CastFromBytes(data))
	case arrow.UINT8:
		signed = false
		output = reflect.ValueOf(arrow.Uint8Traits.CastFromBytes(data))
	case arrow.INT16:
		output = reflect.ValueOf(arrow.Int16Traits.CastFromBytes(data))
	case arrow.UINT16:
		signed = false
		output = reflect.ValueOf(arrow.Uint16Traits.CastFromBytes(data))
	case arrow.UINT32:
		signed = false
		output = reflect.ValueOf(arrow.Uint32Traits.CastFromBytes(data))
	case arrow.UINT64:
		signed = false
		output = reflect.ValueOf(arrow.Uint64Traits.CastFromBytes(data))
	case arrow.DATE32:
		output = reflect.ValueOf(arrow.Date32Traits.CastFromBytes(data))
	case arrow.TIME32:
		output = reflect.ValueOf(arrow.Time32Traits.CastFromBytes(data))
	case arrow.TIME64:
		output = reflect.ValueOf(arrow.Time64Traits.CastFromBytes(data))
	}

	length := rdr.ValuesWritten()
	// copy the values semantically with the correct types
	switch rdr.Type() {
	case parquet.Types.Int32:
		values := arrow.Int32Traits.CastFromBytes(rdr.Values())
		if signed {
			for idx, v := range values[:length] {
				output.Index(idx).SetInt(int64(v))
			}
		} else {
			for idx, v := range values[:length] {
				output.Index(idx).SetUint(uint64(v))
			}
		}
	case parquet.Types.Int64:
		values := arrow.Int64Traits.CastFromBytes(rdr.Values())
		if signed {
			for idx, v := range values[:length] {
				output.Index(idx).SetInt(v)
			}
		} else {
			for idx, v := range values[:length] {
				output.Index(idx).SetUint(uint64(v))
			}
		}
	}

	// take over the validity bitmap; NewData retains it, so release our ref
	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}

	return array.NewData(dt, rdr.ValuesWritten(), []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}
   598  
// transferBool packs the record reader's byte-per-value booleans into an
// arrow boolean (bit-packed) array.
func transferBool(rdr file.RecordReader) arrow.ArrayData {
	// TODO(mtopol): optimize this so we don't convert bitmap to []bool back to bitmap
	length := rdr.ValuesWritten()
	data := make([]byte, int(bitutil.BytesForBits(int64(length))))
	bytedata := rdr.Values()
	// reinterpret []byte as []bool (one byte per value in the reader's output)
	values := *(*[]bool)(unsafe.Pointer(&bytedata))

	for idx, v := range values[:length] {
		if v {
			bitutil.SetBit(data, idx)
		}
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	return array.NewData(&arrow.BooleanType{}, length, []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}
   620  
// milliPerDay is the number of milliseconds in one day, used to scale
// date32 day counts up to date64 milliseconds.
var milliPerDay = time.Duration(24 * time.Hour).Milliseconds()

// parquet equivalent for date64 is a 32-bit integer of the number of days
// since the epoch. Convert each value to milliseconds for date64
func transferDate64(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()
	values := arrow.Int32Traits.CastFromBytes(rdr.Values())

	data := make([]byte, arrow.Int64Traits.BytesRequired(length))
	out := arrow.Int64Traits.CastFromBytes(data)
	for idx, val := range values[:length] {
		out[idx] = int64(val) * milliPerDay
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}
   643  
// transferInt96 coerces int96 values to nanosecond timestamps.
func transferInt96(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()
	values := parquet.Int96Traits.CastFromBytes(rdr.Values())

	data := make([]byte, arrow.Int64SizeBytes*length)
	out := arrow.Int64Traits.CastFromBytes(data)

	for idx, val := range values[:length] {
		// a zero julian-day field (upper 4 bytes) maps to timestamp 0
		if binary.LittleEndian.Uint32(val[8:]) == 0 {
			out[idx] = 0
		} else {
			out[idx] = val.ToTime().UnixNano()
		}
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}
   668  
// transferDecimalInteger converts physical integer storage (int32/int64) of
// a decimal logical type into a decimal128-typed array.
func transferDecimalInteger(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()

	var values reflect.Value
	switch rdr.Type() {
	case parquet.Types.Int32:
		values = reflect.ValueOf(arrow.Int32Traits.CastFromBytes(rdr.Values())[:length])
	case parquet.Types.Int64:
		values = reflect.ValueOf(arrow.Int64Traits.CastFromBytes(rdr.Values())[:length])
	}

	data := make([]byte, arrow.Decimal128Traits.BytesRequired(length))
	out := arrow.Decimal128Traits.CastFromBytes(data)
	for i := 0; i < values.Len(); i++ {
		// sign-extend each integer into a 128-bit decimal
		out[i] = decimal128.FromI64(values.Index(i).Int())
	}

	// only take over the validity bitmap when there are nulls to convey
	var nullmap *memory.Buffer
	if rdr.NullCount() > 0 {
		nullmap = rdr.ReleaseValidBits()
		defer nullmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		nullmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}
   696  
   697  func uint64FromBigEndianShifted(buf []byte) uint64 {
   698  	var (
   699  		bytes [8]byte
   700  	)
   701  	copy(bytes[8-len(buf):], buf)
   702  	return binary.BigEndian.Uint64(bytes[:])
   703  }
   704  
// parquet's defined encoding for decimal data is for it to be written as big
// endian bytes, so convert a big endian byte order to a decimal128
func bigEndianToDecimal128(buf []byte) (decimal128.Num, error) {
	const (
		minDecimalBytes = 1
		maxDecimalBytes = 16
	)

	if len(buf) < minDecimalBytes || len(buf) > maxDecimalBytes {
		return decimal128.Num{}, xerrors.Errorf("length of byte array passed to bigEndianToDecimal128 was %d but must be between %d and %d",
			len(buf), minDecimalBytes, maxDecimalBytes)
	}

	// bytes are big endian so first byte is MSB and holds the sign bit
	isNeg := int8(buf[0]) < 0

	// 1. extract high bits
	highBitsOffset := utils.MaxInt(0, len(buf)-8)
	var (
		highBits uint64
		lowBits  uint64
		hi       int64
		lo       int64
	)
	highBits = uint64FromBigEndianShifted(buf[:highBitsOffset])

	if highBitsOffset == 8 {
		// full 16-byte input: the first 8 bytes are the high word verbatim
		hi = int64(highBits)
	} else {
		// sign-extend for negative values shorter than 16 bytes
		if isNeg && len(buf) < maxDecimalBytes {
			hi = -1
		}

		hi = int64(uint64(hi) << (uint64(highBitsOffset) * 8))
		hi |= int64(highBits)
	}

	// 2. extract lower bits
	lowBitsOffset := utils.MinInt(len(buf), 8)
	lowBits = uint64FromBigEndianShifted(buf[highBitsOffset:])

	if lowBitsOffset == 8 {
		lo = int64(lowBits)
	} else {
		// sign-extend the low word for negative values shorter than 8 bytes
		if isNeg && len(buf) < 8 {
			lo = -1
		}

		lo = int64(uint64(lo) << (uint64(lowBitsOffset) * 8))
		lo |= int64(lowBits)
	}

	return decimal128.New(hi, uint64(lo)), nil
}
   759  
// varOrFixedBin abstracts over variable-length and fixed-size binary arrays,
// both of which expose per-element byte access via Value.
type varOrFixedBin interface {
	arrow.Array
	Value(i int) []byte
}
   764  
   765  // convert physical byte storage, instead of integers, to decimal128
   766  func transferDecimalBytes(rdr file.BinaryRecordReader, dt arrow.DataType) (*arrow.Chunked, error) {
   767  	convert := func(arr arrow.Array) (arrow.Array, error) {
   768  		length := arr.Len()
   769  		data := make([]byte, arrow.Decimal128Traits.BytesRequired(length))
   770  		out := arrow.Decimal128Traits.CastFromBytes(data)
   771  
   772  		input := arr.(varOrFixedBin)
   773  		nullCount := input.NullN()
   774  
   775  		var err error
   776  		for i := 0; i < length; i++ {
   777  			if nullCount > 0 && input.IsNull(i) {
   778  				continue
   779  			}
   780  
   781  			rec := input.Value(i)
   782  			if len(rec) <= 0 {
   783  				return nil, xerrors.Errorf("invalud BYTEARRAY length for type: %s", dt)
   784  			}
   785  			out[i], err = bigEndianToDecimal128(rec)
   786  			if err != nil {
   787  				return nil, err
   788  			}
   789  		}
   790  
   791  		ret := array.NewData(dt, length, []*memory.Buffer{
   792  			input.Data().Buffers()[0], memory.NewBufferBytes(data),
   793  		}, nil, nullCount, 0)
   794  		defer ret.Release()
   795  		return array.MakeFromData(ret), nil
   796  	}
   797  
   798  	chunks := rdr.GetBuilderChunks()
   799  	var err error
   800  	for idx, chunk := range chunks {
   801  		defer chunk.Release()
   802  		if chunks[idx], err = convert(chunk); err != nil {
   803  			return nil, err
   804  		}
   805  		defer chunks[idx].Release()
   806  	}
   807  	return arrow.NewChunked(dt, chunks), nil
   808  }