github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/byte_array_encoder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"encoding/binary"
    21  	"unsafe"
    22  
    23  	"github.com/apache/arrow/go/v10/arrow"
    24  	"github.com/apache/arrow/go/v10/internal/bitutils"
    25  	"github.com/apache/arrow/go/v10/internal/utils"
    26  	"github.com/apache/arrow/go/v10/parquet"
    27  )
    28  
// PlainByteArrayEncoder encodes byte arrays according to the spec for Plain
// encoding: each value is written as a 4-byte length prefix followed by the
// bytes of the value itself.
type PlainByteArrayEncoder struct {
	encoder

	// bitSetReader is created on the first PutSpaced call that supplies a
	// validity bitmap and is Reset (rather than re-created) on later calls.
	bitSetReader bitutils.SetBitRunReader
}
    36  
    37  // PutByteArray writes out the 4 bytes for the length followed by the data
    38  func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) {
    39  	inc := val.Len() + arrow.Uint32SizeBytes
    40  	enc.sink.Reserve(inc)
    41  	vlen := utils.ToLEUint32(uint32(val.Len()))
    42  	enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:])
    43  	enc.sink.UnsafeWrite(val)
    44  }
    45  
    46  // Put writes out all of the values in this slice to the encoding sink
    47  func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) {
    48  	for _, val := range in {
    49  		enc.PutByteArray(val)
    50  	}
    51  }
    52  
    53  // PutSpaced uses the bitmap of validBits to leave out anything that is null according
    54  // to the bitmap.
    55  //
    56  // If validBits is nil, this is equivalent to calling Put
    57  func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
    58  	if validBits != nil {
    59  		if enc.bitSetReader == nil {
    60  			enc.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in)))
    61  		} else {
    62  			enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in)))
    63  		}
    64  
    65  		for {
    66  			run := enc.bitSetReader.NextRun()
    67  			if run.Length == 0 {
    68  				break
    69  			}
    70  			enc.Put(in[int(run.Pos):int(run.Pos+run.Length)])
    71  		}
    72  	} else {
    73  		enc.Put(in)
    74  	}
    75  }
    76  
// Type reports the parquet type handled by this encoder, which is always
// parquet.Types.ByteArray.
func (PlainByteArrayEncoder) Type() parquet.Type {
	return parquet.Types.ByteArray
}
    81  
    82  // WriteDict writes the dictionary out to the provided slice, out should be
    83  // at least DictEncodedSize() bytes
    84  func (enc *DictByteArrayEncoder) WriteDict(out []byte) {
    85  	enc.memo.(BinaryMemoTable).VisitValues(0, func(v []byte) {
    86  		binary.LittleEndian.PutUint32(out, uint32(len(v)))
    87  		out = out[arrow.Uint32SizeBytes:]
    88  		copy(out, v)
    89  		out = out[len(v):]
    90  	})
    91  }
    92  
    93  // PutByteArray adds a single byte array to buffer, updating the dictionary
    94  // and encoded size if it's a new value
    95  func (enc *DictByteArrayEncoder) PutByteArray(in parquet.ByteArray) {
    96  	if in == nil {
    97  		in = empty[:]
    98  	}
    99  	memoIdx, found, err := enc.memo.GetOrInsert(in)
   100  	if err != nil {
   101  		panic(err)
   102  	}
   103  	if !found {
   104  		enc.dictEncodedSize += in.Len() + arrow.Uint32SizeBytes
   105  	}
   106  	enc.addIndex(memoIdx)
   107  }
   108  
   109  // Put takes a slice of ByteArrays to add and encode.
   110  func (enc *DictByteArrayEncoder) Put(in []parquet.ByteArray) {
   111  	for _, val := range in {
   112  		enc.PutByteArray(val)
   113  	}
   114  }
   115  
   116  // PutSpaced like with the non-dict encoder leaves out the values where the validBits bitmap is 0
   117  func (enc *DictByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
   118  	bitutils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error {
   119  		for i := int64(0); i < length; i++ {
   120  			enc.PutByteArray(in[i+pos])
   121  		}
   122  		return nil
   123  	})
   124  }