// Source: github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/encoding/byte_array_encoder.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package encoding

import (
	"encoding/binary"
	"unsafe"

	"github.com/apache/arrow/go/v7/arrow"
	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/internal/utils"
)

// PlainByteArrayEncoder encodes byte arrays according to the spec for Plain encoding
// by encoding the length as an int32 followed by the bytes of the value.
30 type PlainByteArrayEncoder struct { 31 encoder 32 33 bitSetReader utils.SetBitRunReader 34 } 35 36 // PutByteArray writes out the 4 bytes for the length followed by the data 37 func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) { 38 inc := val.Len() + arrow.Uint32SizeBytes 39 enc.sink.Reserve(inc) 40 vlen := utils.ToLEUint32(uint32(val.Len())) 41 enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:]) 42 enc.sink.UnsafeWrite(val) 43 } 44 45 // Put writes out all of the values in this slice to the encoding sink 46 func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) { 47 for _, val := range in { 48 enc.PutByteArray(val) 49 } 50 } 51 52 // PutSpaced uses the bitmap of validBits to leave out anything that is null according 53 // to the bitmap. 54 // 55 // If validBits is nil, this is equivalent to calling Put 56 func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 57 if validBits != nil { 58 if enc.bitSetReader == nil { 59 enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) 60 } else { 61 enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) 62 } 63 64 for { 65 run := enc.bitSetReader.NextRun() 66 if run.Length == 0 { 67 break 68 } 69 enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) 70 } 71 } else { 72 enc.Put(in) 73 } 74 } 75 76 // Type returns parquet.Types.ByteArray for the bytearray encoder 77 func (PlainByteArrayEncoder) Type() parquet.Type { 78 return parquet.Types.ByteArray 79 } 80 81 // WriteDict writes the dictionary out to the provided slice, out should be 82 // at least DictEncodedSize() bytes 83 func (enc *DictByteArrayEncoder) WriteDict(out []byte) { 84 enc.memo.(BinaryMemoTable).VisitValues(0, func(v []byte) { 85 binary.LittleEndian.PutUint32(out, uint32(len(v))) 86 out = out[arrow.Uint32SizeBytes:] 87 copy(out, v) 88 out = out[len(v):] 89 }) 90 } 91 92 // PutByteArray adds a single byte array to buffer, 
updating the dictionary 93 // and encoded size if it's a new value 94 func (enc *DictByteArrayEncoder) PutByteArray(in parquet.ByteArray) { 95 if in == nil { 96 in = empty[:] 97 } 98 memoIdx, found, err := enc.memo.GetOrInsert(in) 99 if err != nil { 100 panic(err) 101 } 102 if !found { 103 enc.dictEncodedSize += in.Len() + arrow.Uint32SizeBytes 104 } 105 enc.addIndex(memoIdx) 106 } 107 108 // Put takes a slice of ByteArrays to add and encode. 109 func (enc *DictByteArrayEncoder) Put(in []parquet.ByteArray) { 110 for _, val := range in { 111 enc.PutByteArray(val) 112 } 113 } 114 115 // PutSpaced like with the non-dict encoder leaves out the values where the validBits bitmap is 0 116 func (enc *DictByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 117 utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { 118 for i := int64(0); i < length; i++ { 119 enc.PutByteArray(in[i+pos]) 120 } 121 return nil 122 }) 123 }