github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/encoding/fixed_len_byte_array_encoder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"fmt"
    21  
    22  	"github.com/apache/arrow/go/v14/arrow"
    23  	"github.com/apache/arrow/go/v14/internal/bitutils"
    24  	"github.com/apache/arrow/go/v14/parquet"
    25  )
    26  
    27  // PlainFixedLenByteArrayEncoder writes the raw bytes of the byte array
    28  // always writing typeLength bytes for each value.
    29  type PlainFixedLenByteArrayEncoder struct {
    30  	encoder
    31  
    32  	bitSetReader bitutils.SetBitRunReader
    33  }
    34  
    35  // Put writes the provided values to the encoder
    36  func (enc *PlainFixedLenByteArrayEncoder) Put(in []parquet.FixedLenByteArray) {
    37  	typeLen := enc.descr.TypeLength()
    38  	if typeLen == 0 {
    39  		return
    40  	}
    41  
    42  	bytesNeeded := len(in) * typeLen
    43  	enc.sink.Reserve(bytesNeeded)
    44  	for _, val := range in {
    45  		if val == nil {
    46  			panic("value cannot be nil")
    47  		}
    48  		enc.sink.UnsafeWrite(val[:typeLen])
    49  	}
    50  }
    51  
    52  // PutSpaced is like Put but works with data that is spaced out according to the passed in bitmap
    53  func (enc *PlainFixedLenByteArrayEncoder) PutSpaced(in []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) {
    54  	if validBits != nil {
    55  		if enc.bitSetReader == nil {
    56  			enc.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in)))
    57  		} else {
    58  			enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in)))
    59  		}
    60  
    61  		for {
    62  			run := enc.bitSetReader.NextRun()
    63  			if run.Length == 0 {
    64  				break
    65  			}
    66  			enc.Put(in[int(run.Pos):int(run.Pos+run.Length)])
    67  		}
    68  	} else {
    69  		enc.Put(in)
    70  	}
    71  }
    72  
    73  // Type returns the underlying physical type this encoder works with, Fixed Length byte arrays.
    74  func (PlainFixedLenByteArrayEncoder) Type() parquet.Type {
    75  	return parquet.Types.FixedLenByteArray
    76  }
    77  
    78  // WriteDict overrides the embedded WriteDict function to call a specialized function
    79  // for copying out the Fixed length values from the dictionary more efficiently.
    80  func (enc *DictFixedLenByteArrayEncoder) WriteDict(out []byte) {
    81  	enc.memo.(BinaryMemoTable).CopyFixedWidthValues(0, enc.typeLen, out)
    82  }
    83  
    84  // Put writes fixed length values to a dictionary encoded column
    85  func (enc *DictFixedLenByteArrayEncoder) Put(in []parquet.FixedLenByteArray) {
    86  	for _, v := range in {
    87  		memoIdx, found, err := enc.memo.GetOrInsert(v)
    88  		if err != nil {
    89  			panic(err)
    90  		}
    91  		if !found {
    92  			enc.dictEncodedSize += enc.typeLen
    93  		}
    94  		enc.addIndex(memoIdx)
    95  	}
    96  }
    97  
    98  // PutSpaced is like Put but leaves space for nulls
    99  func (enc *DictFixedLenByteArrayEncoder) PutSpaced(in []parquet.FixedLenByteArray, validBits []byte, validBitsOffset int64) {
   100  	bitutils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error {
   101  		enc.Put(in[pos : pos+length])
   102  		return nil
   103  	})
   104  }
   105  
   106  // PutDictionary allows pre-seeding a dictionary encoder with
   107  // a dictionary from an Arrow Array.
   108  //
   109  // The passed in array must not have any nulls and this can only
   110  // be called on an empty encoder.
   111  func (enc *DictFixedLenByteArrayEncoder) PutDictionary(values arrow.Array) error {
   112  	if values.DataType().ID() != arrow.FIXED_SIZE_BINARY && values.DataType().ID() != arrow.DECIMAL {
   113  		return fmt.Errorf("%w: only fixed size binary and decimal128 arrays are supported", arrow.ErrInvalid)
   114  	}
   115  
   116  	if values.DataType().(arrow.FixedWidthDataType).Bytes() != enc.typeLen {
   117  		return fmt.Errorf("%w: size mismatch: %s should have been %d wide",
   118  			arrow.ErrInvalid, values.DataType(), enc.typeLen)
   119  	}
   120  
   121  	if err := enc.canPutDictionary(values); err != nil {
   122  		return err
   123  	}
   124  
   125  	enc.dictEncodedSize += enc.typeLen * values.Len()
   126  	data := values.Data().Buffers()[1].Bytes()[values.Data().Offset()*enc.typeLen:]
   127  	for i := 0; i < values.Len(); i++ {
   128  		_, _, err := enc.memo.GetOrInsert(data[i*enc.typeLen : (i+1)*enc.typeLen])
   129  		if err != nil {
   130  			return err
   131  		}
   132  	}
   133  
   134  	values.Retain()
   135  	enc.preservedDict = values
   136  	return nil
   137  }