github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/encoding/byte_array_decoder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"encoding/binary"
    21  
    22  	"github.com/apache/arrow/go/v14/arrow"
    23  	"github.com/apache/arrow/go/v14/arrow/array"
    24  	"github.com/apache/arrow/go/v14/arrow/memory"
    25  	"github.com/apache/arrow/go/v14/internal/utils"
    26  	"github.com/apache/arrow/go/v14/parquet"
    27  	pqutils "github.com/apache/arrow/go/v14/parquet/internal/utils"
    28  	"golang.org/x/xerrors"
    29  )
    30  
    31  // PlainByteArrayDecoder decodes a data chunk for bytearrays according to
    32  // the plain encoding. The byte arrays will use slices to reference the
    33  // data rather than copying it.
    34  //
    35  // The parquet spec defines Plain encoding for ByteArrays as a 4 byte little
    36  // endian integer containing the length of the bytearray followed by that many
    37  // bytes being the raw data of the byte array.
    38  type PlainByteArrayDecoder struct {
    39  	decoder
    40  }
    41  
    42  // Type returns parquet.Types.ByteArray for this decoder
    43  func (PlainByteArrayDecoder) Type() parquet.Type {
    44  	return parquet.Types.ByteArray
    45  }
    46  
    47  // Decode will populate the slice of bytearrays in full or until the number
    48  // of values is consumed.
    49  //
    50  // Returns the number of values that were decoded.
    51  func (pbad *PlainByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) {
    52  	max := utils.MinInt(len(out), pbad.nvals)
    53  
    54  	for i := 0; i < max; i++ {
    55  		// there should always be at least four bytes which is the length of the
    56  		// next value in the data.
    57  		if len(pbad.data) < 4 {
    58  			return i, xerrors.New("parquet: eof reading bytearray")
    59  		}
    60  
    61  		// the first 4 bytes are a little endian int32 length
    62  		byteLen := int32(binary.LittleEndian.Uint32(pbad.data[:4]))
    63  		if byteLen < 0 {
    64  			return i, xerrors.New("parquet: invalid BYTE_ARRAY value")
    65  		}
    66  
    67  		if int64(len(pbad.data)) < int64(byteLen)+4 {
    68  			return i, xerrors.New("parquet: eof reading bytearray")
    69  		}
    70  
    71  		out[i] = pbad.data[4 : byteLen+4 : byteLen+4]
    72  		pbad.data = pbad.data[byteLen+4:]
    73  	}
    74  
    75  	pbad.nvals -= max
    76  	return max, nil
    77  }
    78  
    79  // DecodeSpaced is like Decode, but expands the slice out to leave empty values
    80  // where the validBits bitmap has 0s
    81  func (pbad *PlainByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
    82  	toRead := len(out) - nullCount
    83  	valuesRead, err := pbad.Decode(out[:toRead])
    84  	if err != nil {
    85  		return valuesRead, err
    86  	}
    87  	if valuesRead != toRead {
    88  		return valuesRead, xerrors.New("parquet: number of values / definition levels read did not match")
    89  	}
    90  
    91  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
    92  }
    93  
    94  func (d *DictByteArrayDecoder) InsertDictionary(bldr array.Builder) error {
    95  	conv := d.dictValueDecoder.(*ByteArrayDictConverter)
    96  	dictLength := cap(conv.dict)
    97  	conv.ensure(pqutils.IndexType(dictLength))
    98  
    99  	byteArrayData := memory.NewResizableBuffer(d.mem)
   100  	defer byteArrayData.Release()
   101  	byteArrayOffsets := memory.NewResizableBuffer(d.mem)
   102  	defer byteArrayOffsets.Release()
   103  
   104  	var totalLen int
   105  	for _, v := range conv.dict {
   106  		totalLen += len(v)
   107  	}
   108  	byteArrayData.ResizeNoShrink(totalLen)
   109  	byteArrayOffsets.ResizeNoShrink((dictLength + 1) * arrow.Int32SizeBytes)
   110  
   111  	byteData := byteArrayData.Bytes()
   112  	byteOffsets := arrow.Int32Traits.CastFromBytes(byteArrayOffsets.Bytes())
   113  
   114  	var offset int32
   115  	for i, v := range conv.dict {
   116  		n := copy(byteData, v)
   117  		byteData, byteOffsets[i] = byteData[n:], offset
   118  		offset += int32(n)
   119  	}
   120  	byteOffsets[dictLength] = offset
   121  
   122  	data := array.NewData(bldr.Type().(*arrow.DictionaryType).ValueType, dictLength,
   123  		[]*memory.Buffer{nil, byteArrayOffsets, byteArrayData}, nil, 0, 0)
   124  	defer data.Release()
   125  	arr := array.NewBinaryData(data)
   126  	defer arr.Release()
   127  
   128  	binaryBldr := bldr.(*array.BinaryDictionaryBuilder)
   129  	return binaryBldr.InsertDictValues(arr)
   130  }