github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/byte_array_encoder.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"encoding/binary"
    21  	"unsafe"
    22  
    23  	"github.com/apache/arrow/go/v7/arrow"
    24  	"github.com/apache/arrow/go/v7/parquet"
    25  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    26  )
    27  
// PlainByteArrayEncoder encodes byte arrays according to the spec for Plain encoding
// by encoding the length as a 4-byte prefix followed by the bytes of the value.
type PlainByteArrayEncoder struct {
	// encoder is the embedded base encoder; the methods of this type write
	// their output through its sink.
	encoder

	// bitSetReader is created by the first PutSpaced call that supplies a
	// validity bitmap, and is Reset and reused by later calls.
	bitSetReader utils.SetBitRunReader
}
    35  
    36  // PutByteArray writes out the 4 bytes for the length followed by the data
    37  func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) {
    38  	inc := val.Len() + arrow.Uint32SizeBytes
    39  	enc.sink.Reserve(inc)
    40  	vlen := utils.ToLEUint32(uint32(val.Len()))
    41  	enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:])
    42  	enc.sink.UnsafeWrite(val)
    43  }
    44  
    45  // Put writes out all of the values in this slice to the encoding sink
    46  func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) {
    47  	for _, val := range in {
    48  		enc.PutByteArray(val)
    49  	}
    50  }
    51  
    52  // PutSpaced uses the bitmap of validBits to leave out anything that is null according
    53  // to the bitmap.
    54  //
    55  // If validBits is nil, this is equivalent to calling Put
    56  func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
    57  	if validBits != nil {
    58  		if enc.bitSetReader == nil {
    59  			enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in)))
    60  		} else {
    61  			enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in)))
    62  		}
    63  
    64  		for {
    65  			run := enc.bitSetReader.NextRun()
    66  			if run.Length == 0 {
    67  				break
    68  			}
    69  			enc.Put(in[int(run.Pos):int(run.Pos+run.Length)])
    70  		}
    71  	} else {
    72  		enc.Put(in)
    73  	}
    74  }
    75  
    76  // Type returns parquet.Types.ByteArray for the bytearray encoder
    77  func (PlainByteArrayEncoder) Type() parquet.Type {
    78  	return parquet.Types.ByteArray
    79  }
    80  
    81  // WriteDict writes the dictionary out to the provided slice, out should be
    82  // at least DictEncodedSize() bytes
    83  func (enc *DictByteArrayEncoder) WriteDict(out []byte) {
    84  	enc.memo.(BinaryMemoTable).VisitValues(0, func(v []byte) {
    85  		binary.LittleEndian.PutUint32(out, uint32(len(v)))
    86  		out = out[arrow.Uint32SizeBytes:]
    87  		copy(out, v)
    88  		out = out[len(v):]
    89  	})
    90  }
    91  
    92  // PutByteArray adds a single byte array to buffer, updating the dictionary
    93  // and encoded size if it's a new value
    94  func (enc *DictByteArrayEncoder) PutByteArray(in parquet.ByteArray) {
    95  	if in == nil {
    96  		in = empty[:]
    97  	}
    98  	memoIdx, found, err := enc.memo.GetOrInsert(in)
    99  	if err != nil {
   100  		panic(err)
   101  	}
   102  	if !found {
   103  		enc.dictEncodedSize += in.Len() + arrow.Uint32SizeBytes
   104  	}
   105  	enc.addIndex(memoIdx)
   106  }
   107  
   108  // Put takes a slice of ByteArrays to add and encode.
   109  func (enc *DictByteArrayEncoder) Put(in []parquet.ByteArray) {
   110  	for _, val := range in {
   111  		enc.PutByteArray(val)
   112  	}
   113  }
   114  
   115  // PutSpaced like with the non-dict encoder leaves out the values where the validBits bitmap is 0
   116  func (enc *DictByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
   117  	utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error {
   118  		for i := int64(0); i < length; i++ {
   119  			enc.PutByteArray(in[i+pos])
   120  		}
   121  		return nil
   122  	})
   123  }