github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/byte_array_encoder.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "encoding/binary" 21 "fmt" 22 "unsafe" 23 24 "github.com/apache/arrow/go/v14/arrow" 25 "github.com/apache/arrow/go/v14/arrow/array" 26 "github.com/apache/arrow/go/v14/internal/bitutils" 27 "github.com/apache/arrow/go/v14/internal/utils" 28 "github.com/apache/arrow/go/v14/parquet" 29 ) 30 31 // PlainByteArrayEncoder encodes byte arrays according to the spec for Plain encoding 32 // by encoding the length as a int32 followed by the bytes of the value. 33 type PlainByteArrayEncoder struct { 34 encoder 35 36 bitSetReader bitutils.SetBitRunReader 37 } 38 39 // PutByteArray writes out the 4 bytes for the length followed by the data 40 func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) { 41 inc := val.Len() + arrow.Uint32SizeBytes 42 enc.sink.Reserve(inc) 43 vlen := utils.ToLEUint32(uint32(val.Len())) 44 enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:]) 45 enc.sink.UnsafeWrite(val) 46 } 47 48 // Put writes out all of the values in this slice to the encoding sink 49 func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) { 50 for _, val := range in { 51 enc.PutByteArray(val) 52 } 53 } 54 55 // PutSpaced uses the bitmap of validBits to leave out anything that is null according 56 // to the bitmap. 57 // 58 // If validBits is nil, this is equivalent to calling Put 59 func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 60 if validBits != nil { 61 if enc.bitSetReader == nil { 62 enc.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) 63 } else { 64 enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) 65 } 66 67 for { 68 run := enc.bitSetReader.NextRun() 69 if run.Length == 0 { 70 break 71 } 72 enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) 73 } 74 } else { 75 enc.Put(in) 76 } 77 } 78 79 // Type returns parquet.Types.ByteArray for the bytearray encoder 80 func (PlainByteArrayEncoder) Type() parquet.Type { 81 return parquet.Types.ByteArray 82 } 83 84 // WriteDict writes the dictionary out to the provided slice, out should be 85 // at least DictEncodedSize() bytes 86 func (enc *DictByteArrayEncoder) WriteDict(out []byte) { 87 enc.memo.(BinaryMemoTable).VisitValues(0, func(v []byte) { 88 binary.LittleEndian.PutUint32(out, uint32(len(v))) 89 out = out[arrow.Uint32SizeBytes:] 90 copy(out, v) 91 out = out[len(v):] 92 }) 93 } 94 95 // PutByteArray adds a single byte array to buffer, updating the dictionary 96 // and encoded size if it's a new value 97 func (enc *DictByteArrayEncoder) PutByteArray(in parquet.ByteArray) { 98 memoIdx, found, err := enc.memo.GetOrInsert(in) 99 if err != nil { 100 panic(err) 101 } 102 if !found { 103 enc.dictEncodedSize += in.Len() + arrow.Uint32SizeBytes 104 } 105 enc.addIndex(memoIdx) 106 } 107 108 // Put takes a slice of ByteArrays to add and encode. 109 func (enc *DictByteArrayEncoder) Put(in []parquet.ByteArray) { 110 for _, val := range in { 111 enc.PutByteArray(val) 112 } 113 } 114 115 // PutSpaced like with the non-dict encoder leaves out the values where the validBits bitmap is 0 116 func (enc *DictByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { 117 bitutils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { 118 for i := int64(0); i < length; i++ { 119 enc.PutByteArray(in[i+pos]) 120 } 121 return nil 122 }) 123 } 124 125 // PutDictionary allows pre-seeding a dictionary encoder with 126 // a dictionary from an Arrow Array. 127 // 128 // The passed in array must not have any nulls and this can only 129 // be called on an empty encoder. 130 func (enc *DictByteArrayEncoder) PutDictionary(values arrow.Array) error { 131 if err := enc.canPutDictionary(values); err != nil { 132 return err 133 } 134 135 if !arrow.IsBaseBinary(values.DataType().ID()) { 136 return fmt.Errorf("%w: only binary and string arrays are supported", arrow.ErrInvalid) 137 } 138 139 arr := values.(array.BinaryLike) 140 data := arr.ValueBytes() 141 for i := 0; i < arr.Len(); i++ { 142 curOffset := arr.ValueOffset64(i) 143 var v []byte 144 if i == arr.Len()-1 { 145 v = data[curOffset:] 146 } else { 147 v = data[curOffset:arr.ValueOffset64(i+1)] 148 } 149 enc.dictEncodedSize += len(v) + arrow.Uint32SizeBytes 150 if _, _, err := enc.memo.GetOrInsert(v); err != nil { 151 return err 152 } 153 } 154 155 values.Retain() 156 enc.preservedDict = values 157 return nil 158 }