github.com/apache/arrow/go/v16@v16.1.0/parquet/internal/encoding/boolean_encoder.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding 18 19 import ( 20 "encoding/binary" 21 22 "github.com/apache/arrow/go/v16/arrow/bitutil" 23 "github.com/apache/arrow/go/v16/parquet" 24 "github.com/apache/arrow/go/v16/parquet/internal/debug" 25 "github.com/apache/arrow/go/v16/parquet/internal/utils" 26 ) 27 28 const ( 29 boolBufSize = 1024 30 boolsInBuf = boolBufSize * 8 31 ) 32 33 // PlainBooleanEncoder encodes bools as a bitmap as per the Plain Encoding 34 type PlainBooleanEncoder struct { 35 encoder 36 bitsBuffer []byte 37 wr utils.BitmapWriter 38 } 39 40 // Type for the PlainBooleanEncoder is parquet.Types.Boolean 41 func (PlainBooleanEncoder) Type() parquet.Type { 42 return parquet.Types.Boolean 43 } 44 45 // Put encodes the contents of in into the underlying data buffer. 46 func (enc *PlainBooleanEncoder) Put(in []bool) { 47 if enc.bitsBuffer == nil { 48 enc.bitsBuffer = make([]byte, boolBufSize) 49 } 50 if enc.wr == nil { 51 enc.wr = utils.NewBitmapWriter(enc.bitsBuffer, 0, boolsInBuf) 52 } 53 if len(in) == 0 { 54 return 55 } 56 57 n := enc.wr.AppendBools(in) 58 for n < len(in) { 59 enc.wr.Finish() 60 enc.append(enc.bitsBuffer) 61 enc.wr.Reset(0, boolsInBuf) 62 in = in[n:] 63 n = enc.wr.AppendBools(in) 64 } 65 } 66 67 // PutSpaced will use the validBits bitmap to determine which values are nulls 68 // and can be left out from the slice, and the encoded without those nulls. 69 func (enc *PlainBooleanEncoder) PutSpaced(in []bool, validBits []byte, validBitsOffset int64) { 70 bufferOut := make([]bool, len(in)) 71 nvalid := spacedCompress(in, bufferOut, validBits, validBitsOffset) 72 enc.Put(bufferOut[:nvalid]) 73 } 74 75 // EstimatedDataEncodedSize returns the current number of bytes that have 76 // been buffered so far 77 func (enc *PlainBooleanEncoder) EstimatedDataEncodedSize() int64 { 78 return int64(enc.sink.Len() + int(bitutil.BytesForBits(int64(enc.wr.Pos())))) 79 } 80 81 // FlushValues returns the buffered data, the responsibility is on the caller 82 // to release the buffer memory 83 func (enc *PlainBooleanEncoder) FlushValues() (Buffer, error) { 84 if enc.wr.Pos() > 0 { 85 toFlush := int(enc.wr.Pos()) 86 enc.append(enc.bitsBuffer[:bitutil.BytesForBits(int64(toFlush))]) 87 } 88 89 enc.wr.Reset(0, boolsInBuf) 90 91 return enc.sink.Finish(), nil 92 } 93 94 const rleLengthInBytes = 4 95 96 type RleBooleanEncoder struct { 97 encoder 98 99 bufferedValues []bool 100 } 101 102 func (RleBooleanEncoder) Type() parquet.Type { 103 return parquet.Types.Boolean 104 } 105 106 func (enc *RleBooleanEncoder) Put(in []bool) { 107 enc.bufferedValues = append(enc.bufferedValues, in...) 108 } 109 110 func (enc *RleBooleanEncoder) PutSpaced(in []bool, validBits []byte, validBitsOffset int64) { 111 bufferOut := make([]bool, len(in)) 112 nvalid := spacedCompress(in, bufferOut, validBits, validBitsOffset) 113 enc.Put(bufferOut[:nvalid]) 114 } 115 116 func (enc *RleBooleanEncoder) EstimatedDataEncodedSize() int64 { 117 return rleLengthInBytes + int64(enc.maxRleBufferSize()) 118 } 119 120 func (enc *RleBooleanEncoder) maxRleBufferSize() int { 121 return utils.MaxRLEBufferSize(1, len(enc.bufferedValues)) + 122 utils.MinRLEBufferSize(1) 123 } 124 125 func (enc *RleBooleanEncoder) FlushValues() (Buffer, error) { 126 rleBufferSizeMax := enc.maxRleBufferSize() 127 enc.sink.SetOffset(rleLengthInBytes) 128 enc.sink.Reserve(rleBufferSizeMax) 129 130 rleEncoder := utils.NewRleEncoder(enc.sink, 1) 131 for _, v := range enc.bufferedValues { 132 if v { 133 rleEncoder.Put(1) 134 } else { 135 rleEncoder.Put(0) 136 } 137 } 138 n := rleEncoder.Flush() 139 debug.Assert(n <= rleBufferSizeMax, "num encoded bytes larger than expected max") 140 buf := enc.sink.Finish() 141 binary.LittleEndian.PutUint32(buf.Bytes(), uint32(n)) 142 143 return buf, nil 144 }