github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/byte_array_decoder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"encoding/binary"
    21  
    22  	"github.com/apache/arrow/go/v7/parquet"
    23  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    24  	"golang.org/x/xerrors"
    25  )
    26  
// PlainByteArrayDecoder decodes a data chunk for bytearrays according to
// the plain encoding. The byte arrays will use slices to reference the
// data rather than copying it, so decoded values alias the decoder's
// underlying buffer.
//
// The parquet spec defines Plain encoding for ByteArrays as a 4 byte little
// endian integer containing the length of the bytearray followed by that many
// bytes being the raw data of the byte array.
type PlainByteArrayDecoder struct {
	// decoder is embedded; Decode reads its data buffer and remaining value
	// count (data and nvals) through the promoted fields.
	decoder
}
    37  
    38  // Type returns parquet.Types.ByteArray for this decoder
    39  func (PlainByteArrayDecoder) Type() parquet.Type {
    40  	return parquet.Types.ByteArray
    41  }
    42  
    43  // Decode will populate the slice of bytearrays in full or until the number
    44  // of values is consumed.
    45  //
    46  // Returns the number of values that were decoded.
    47  func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) {
    48  	max := utils.MinInt(len(out), pbad.nvals)
    49  
    50  	for i := 0; i < max; i++ {
    51  		// there should always be at least four bytes which is the length of the
    52  		// next value in the data.
    53  		if len(pbad.data) < 4 {
    54  			return i, xerrors.New("parquet: eof reading bytearray")
    55  		}
    56  
    57  		// the first 4 bytes are a little endian int32 length
    58  		byteLen := int32(binary.LittleEndian.Uint32(pbad.data[:4]))
    59  		if byteLen < 0 {
    60  			return i, xerrors.New("parquet: invalid BYTE_ARRAY value")
    61  		}
    62  
    63  		if int64(len(pbad.data)) < int64(byteLen)+4 {
    64  			return i, xerrors.New("parquet: eof reading bytearray")
    65  		}
    66  
    67  		out[i] = pbad.data[4 : byteLen+4 : byteLen+4]
    68  		pbad.data = pbad.data[byteLen+4:]
    69  	}
    70  
    71  	pbad.nvals -= max
    72  	return max, nil
    73  }
    74  
    75  // DecodeSpaced is like Decode, but expands the slice out to leave empty values
    76  // where the validBits bitmap has 0s
    77  func (pbad *PlainByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
    78  	toRead := len(out) - nullCount
    79  	valuesRead, err := pbad.Decode(out[:toRead])
    80  	if err != nil {
    81  		return valuesRead, err
    82  	}
    83  	if valuesRead != toRead {
    84  		return valuesRead, xerrors.New("parquet: number of values / definition levels read did not match")
    85  	}
    86  
    87  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
    88  }