// Source: github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/byte_array_decoder.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package encoding

import (
	"encoding/binary"

	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/internal/utils"
	"golang.org/x/xerrors"
)

// PlainByteArrayDecoder decodes a data chunk for bytearrays according to
// the plain encoding. The byte arrays will use slices to reference the
// data rather than copying it, so every decoded value shares memory with
// the decoder's input buffer.
//
// The parquet spec defines Plain encoding for ByteArrays as a 4 byte little
// endian integer containing the length of the bytearray followed by that many
// bytes being the raw data of the byte array.
//
// The struct has no fields of its own: the input buffer (data) and the count
// of values still to be decoded (nvals) used by Decode are promoted from the
// embedded decoder.
type PlainByteArrayDecoder struct {
	decoder
}

// Type returns parquet.Types.ByteArray for this decoder. It uses a value
// receiver and reads no decoder state.
func (PlainByteArrayDecoder) Type() parquet.Type {
	return parquet.Types.ByteArray
}

// Decode will populate the slice of bytearrays in full or until the number
// of values is consumed.
//
// Returns the number of values that were decoded.
47 func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) { 48 max := utils.MinInt(len(out), pbad.nvals) 49 50 for i := 0; i < max; i++ { 51 // there should always be at least four bytes which is the length of the 52 // next value in the data. 53 if len(pbad.data) < 4 { 54 return i, xerrors.New("parquet: eof reading bytearray") 55 } 56 57 // the first 4 bytes are a little endian int32 length 58 byteLen := int32(binary.LittleEndian.Uint32(pbad.data[:4])) 59 if byteLen < 0 { 60 return i, xerrors.New("parquet: invalid BYTE_ARRAY value") 61 } 62 63 if int64(len(pbad.data)) < int64(byteLen)+4 { 64 return i, xerrors.New("parquet: eof reading bytearray") 65 } 66 67 out[i] = pbad.data[4 : byteLen+4 : byteLen+4] 68 pbad.data = pbad.data[byteLen+4:] 69 } 70 71 pbad.nvals -= max 72 return max, nil 73 } 74 75 // DecodeSpaced is like Decode, but expands the slice out to leave empty values 76 // where the validBits bitmap has 0s 77 func (pbad *PlainByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 78 toRead := len(out) - nullCount 79 valuesRead, err := pbad.Decode(out[:toRead]) 80 if err != nil { 81 return valuesRead, err 82 } 83 if valuesRead != toRead { 84 return valuesRead, xerrors.New("parquet: number of values / definition levels read did not match") 85 } 86 87 return spacedExpand(out, nullCount, validBits, validBitsOffset), nil 88 }