github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/encoding/byte_array_decoder.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "encoding/binary" 21 22 "github.com/apache/arrow/go/v14/arrow" 23 "github.com/apache/arrow/go/v14/arrow/array" 24 "github.com/apache/arrow/go/v14/arrow/memory" 25 "github.com/apache/arrow/go/v14/internal/utils" 26 "github.com/apache/arrow/go/v14/parquet" 27 pqutils "github.com/apache/arrow/go/v14/parquet/internal/utils" 28 "golang.org/x/xerrors" 29 ) 30 31 // PlainByteArrayDecoder decodes a data chunk for bytearrays according to 32 // the plain encoding. The byte arrays will use slices to reference the 33 // data rather than copying it. 34 // 35 // The parquet spec defines Plain encoding for ByteArrays as a 4 byte little 36 // endian integer containing the length of the bytearray followed by that many 37 // bytes being the raw data of the byte array. 38 type PlainByteArrayDecoder struct { 39 decoder 40 } 41 42 // Type returns parquet.Types.ByteArray for this decoder 43 func (PlainByteArrayDecoder) Type() parquet.Type { 44 return parquet.Types.ByteArray 45 } 46 47 // Decode will populate the slice of bytearrays in full or until the number 48 // of values is consumed. 49 // 50 // Returns the number of values that were decoded. 51 func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { 52 max := utils.MinInt(len(out), pbad.nvals) 53 54 for i := 0; i < max; i++ { 55 // there should always be at least four bytes which is the length of the 56 // next value in the data. 57 if len(pbad.data) < 4 { 58 return i, xerrors.New("parquet: eof reading bytearray") 59 } 60 61 // the first 4 bytes are a little endian int32 length 62 byteLen := int32(binary.LittleEndian.Uint32(pbad.data[:4])) 63 if byteLen < 0 { 64 return i, xerrors.New("parquet: invalid BYTE_ARRAY value") 65 } 66 67 if int64(len(pbad.data)) < int64(byteLen)+4 { 68 return i, xerrors.New("parquet: eof reading bytearray") 69 } 70 71 out[i] = pbad.data[4 : byteLen+4 : byteLen+4] 72 pbad.data = pbad.data[byteLen+4:] 73 } 74 75 pbad.nvals -= max 76 return max, nil 77 } 78 79 // DecodeSpaced is like Decode, but expands the slice out to leave empty values 80 // where the validBits bitmap has 0s 81 func (pbad *PlainByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 82 toRead := len(out) - nullCount 83 valuesRead, err := pbad.Decode(out[:toRead]) 84 if err != nil { 85 return valuesRead, err 86 } 87 if valuesRead != toRead { 88 return valuesRead, xerrors.New("parquet: number of values / definition levels read did not match") 89 } 90 91 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 92 } 93 94 func (d *DictByteArrayDecoder) InsertDictionary(bldr array.Builder) error { 95 conv := d.dictValueDecoder.(*ByteArrayDictConverter) 96 dictLength := cap(conv.dict) 97 conv.ensure(pqutils.IndexType(dictLength)) 98 99 byteArrayData := memory.NewResizableBuffer(d.mem) 100 defer byteArrayData.Release() 101 byteArrayOffsets := memory.NewResizableBuffer(d.mem) 102 defer byteArrayOffsets.Release() 103 104 var totalLen int 105 for _, v := range conv.dict { 106 totalLen += len(v) 107 } 108 byteArrayData.ResizeNoShrink(totalLen) 109 byteArrayOffsets.ResizeNoShrink((dictLength + 1) * arrow.Int32SizeBytes) 110 111 byteData := byteArrayData.Bytes() 112 byteOffsets := arrow.Int32Traits.CastFromBytes(byteArrayOffsets.Bytes()) 113 114 var offset int32 115 for i, v := range conv.dict { 116 n := copy(byteData, v) 117 byteData, byteOffsets[i] = byteData[n:], offset 118 offset += int32(n) 119 } 120 byteOffsets[dictLength] = offset 121 122 data := array.NewData(bldr.Type().(*arrow.DictionaryType).ValueType, dictLength, 123 []*memory.Buffer{nil, byteArrayOffsets, byteArrayData}, nil, 0, 0) 124 defer data.Release() 125 arr := array.NewBinaryData(data) 126 defer arr.Release() 127 128 binaryBldr := bldr.(*array.BinaryDictionaryBuilder) 129 return binaryBldr.InsertDictValues(arr) 130 }