github.com/apache/arrow/go/v10@v10.0.1/parquet/internal/encoding/byte_array_encoder.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "encoding/binary" 21 "unsafe" 22 23 "github.com/apache/arrow/go/v10/arrow" 24 "github.com/apache/arrow/go/v10/internal/bitutils" 25 "github.com/apache/arrow/go/v10/internal/utils" 26 "github.com/apache/arrow/go/v10/parquet" 27 ) 28 29 // PlainByteArrayEncoder encodes byte arrays according to the spec for Plain encoding 30 // by encoding the length as a int32 followed by the bytes of the value. 31 type PlainByteArrayEncoder struct { 32 encoder 33 34 bitSetReader bitutils.SetBitRunReader 35 } 36 37 // PutByteArray writes out the 4 bytes for the length followed by the data 38 func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) { 39 inc := val.Len() + arrow.Uint32SizeBytes 40 enc.sink.Reserve(inc) 41 vlen := utils.ToLEUint32(uint32(val.Len())) 42 enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:]) 43 enc.sink.UnsafeWrite(val) 44 } 45 46 // Put writes out all of the values in this slice to the encoding sink 47 func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) { 48 for _, val := range in { 49 enc.PutByteArray(val) 50 } 51 } 52 53 // PutSpaced uses the bitmap of validBits to leave out anything that is null according 54 // to the bitmap. 55 // 56 // If validBits is nil, this is equivalent to calling Put 57 func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 58 if validBits != nil { 59 if enc.bitSetReader == nil { 60 enc.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) 61 } else { 62 enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) 63 } 64 65 for { 66 run := enc.bitSetReader.NextRun() 67 if run.Length == 0 { 68 break 69 } 70 enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) 71 } 72 } else { 73 enc.Put(in) 74 } 75 } 76 77 // Type returns parquet.Types.ByteArray for the bytearray encoder 78 func (PlainByteArrayEncoder) Type() parquet.Type { 79 return parquet.Types.ByteArray 80 } 81 82 // WriteDict writes the dictionary out to the provided slice, out should be 83 // at least DictEncodedSize() bytes 84 func (enc *DictByteArrayEncoder) WriteDict(out []byte) { 85 enc.memo.(BinaryMemoTable).VisitValues(0, func(v []byte) { 86 binary.LittleEndian.PutUint32(out, uint32(len(v))) 87 out = out[arrow.Uint32SizeBytes:] 88 copy(out, v) 89 out = out[len(v):] 90 }) 91 } 92 93 // PutByteArray adds a single byte array to buffer, updating the dictionary 94 // and encoded size if it's a new value 95 func (enc *DictByteArrayEncoder) PutByteArray(in parquet.ByteArray) { 96 if in == nil { 97 in = empty[:] 98 } 99 memoIdx, found, err := enc.memo.GetOrInsert(in) 100 if err != nil { 101 panic(err) 102 } 103 if !found { 104 enc.dictEncodedSize += in.Len() + arrow.Uint32SizeBytes 105 } 106 enc.addIndex(memoIdx) 107 } 108 109 // Put takes a slice of ByteArrays to add and encode. 110 func (enc *DictByteArrayEncoder) Put(in []parquet.ByteArray) { 111 for _, val := range in { 112 enc.PutByteArray(val) 113 } 114 } 115 116 // PutSpaced like with the non-dict encoder leaves out the values where the validBits bitmap is 0 117 func (enc *DictByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 118 bitutils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { 119 for i := int64(0); i < length; i++ { 120 enc.PutByteArray(in[i+pos]) 121 } 122 return nil 123 }) 124 }