github.com/apache/arrow/go/v14@v14.0.1/parquet/internal/encoding/delta_length_byte_array.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"github.com/apache/arrow/go/v14/arrow/memory"
    21  	"github.com/apache/arrow/go/v14/internal/utils"
    22  	"github.com/apache/arrow/go/v14/parquet"
    23  	"golang.org/x/xerrors"
    24  )
    25  
// DeltaLengthByteArrayEncoder encodes data by taking all of the byte array lengths
// and encoding them up front using delta encoding, followed by all of the binary data
// concatenated back to back. The expected savings come from the cheaper encoding of the
// lengths and possibly better compression of the data, which is no longer interleaved
// with the lengths.
//
// This encoding is always preferred over PLAIN for byte array columns where possible.
//
// For example, if the data was "Hello", "World", "Foobar", "ABCDEF" the encoded data would be:
// DeltaEncoding(5, 5, 6, 6) "HelloWorldFoobarABCDEF"
type DeltaLengthByteArrayEncoder struct {
	encoder

	// lengthEncoder receives the length of every byte array passed to Put; its
	// flushed output is placed ahead of the raw bytes by FlushValues.
	lengthEncoder *DeltaBitPackInt32Encoder
}
    40  
    41  // Put writes the provided slice of byte arrays to the encoder
    42  func (enc *DeltaLengthByteArrayEncoder) Put(in []parquet.ByteArray) {
    43  	lengths := make([]int32, len(in))
    44  	totalLen := int(0)
    45  	for idx, val := range in {
    46  		lengths[idx] = int32(val.Len())
    47  		totalLen += val.Len()
    48  	}
    49  
    50  	enc.lengthEncoder.Put(lengths)
    51  	enc.sink.Reserve(totalLen)
    52  	for _, val := range in {
    53  		enc.sink.UnsafeWrite(val)
    54  	}
    55  }
    56  
    57  // PutSpaced is like Put, but the data is spaced out according to the bitmap provided and is compressed
    58  // accordingly before it is written to drop the null data from the write.
    59  func (enc *DeltaLengthByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
    60  	if validBits != nil {
    61  		data := make([]parquet.ByteArray, len(in))
    62  		nvalid := spacedCompress(in, data, validBits, validBitsOffset)
    63  		enc.Put(data[:nvalid])
    64  	} else {
    65  		enc.Put(in)
    66  	}
    67  }
    68  
// Type returns the underlying physical type handled by this encoder, which is
// always parquet.Types.ByteArray.
func (DeltaLengthByteArrayEncoder) Type() parquet.Type {
	return parquet.Types.ByteArray
}
    73  
    74  // FlushValues flushes any remaining data and returns the final encoded buffer of data
    75  // or returns nil and any error encountered.
    76  func (enc *DeltaLengthByteArrayEncoder) FlushValues() (Buffer, error) {
    77  	ret, err := enc.lengthEncoder.FlushValues()
    78  	if err != nil {
    79  		return nil, err
    80  	}
    81  	defer ret.Release()
    82  
    83  	data := enc.sink.Finish()
    84  	defer data.Release()
    85  
    86  	output := bufferPool.Get().(*memory.Buffer)
    87  	output.ResizeNoShrink(ret.Len() + data.Len())
    88  	copy(output.Bytes(), ret.Bytes())
    89  	copy(output.Bytes()[ret.Len():], data.Bytes())
    90  	return poolBuffer{output}, nil
    91  }
    92  
// DeltaLengthByteArrayDecoder is a decoder for handling data produced by the corresponding
// encoder which expects delta packed lengths followed by the bytes of data.
type DeltaLengthByteArrayDecoder struct {
	decoder

	// mem is the allocator passed on to the length decoder in SetData and
	// returned by Allocator.
	mem memory.Allocator
	// lengths holds the byte array lengths decoded by SetData that have not
	// yet been consumed by Decode.
	lengths []int32
}
   101  
// Type returns the underlying physical type handled by this decoder, which is
// always parquet.Types.ByteArray.
func (DeltaLengthByteArrayDecoder) Type() parquet.Type {
	return parquet.Types.ByteArray
}
   106  
// Allocator returns the memory.Allocator held by this decoder.
func (d *DeltaLengthByteArrayDecoder) Allocator() memory.Allocator { return d.mem }
   108  
   109  // SetData sets in the expected data to the decoder which should be nvalues delta packed lengths
   110  // followed by the rest of the byte array data immediately after.
   111  func (d *DeltaLengthByteArrayDecoder) SetData(nvalues int, data []byte) error {
   112  	dec := DeltaBitPackInt32Decoder{
   113  		deltaBitPackDecoder: &deltaBitPackDecoder{
   114  			decoder: newDecoderBase(d.encoding, d.descr),
   115  			mem:     d.mem}}
   116  
   117  	if err := dec.SetData(nvalues, data); err != nil {
   118  		return err
   119  	}
   120  	d.lengths = make([]int32, dec.totalValues)
   121  	dec.Decode(d.lengths)
   122  
   123  	return d.decoder.SetData(nvalues, data[int(dec.bytesRead()):])
   124  }
   125  
   126  // Decode populates the passed in slice with data decoded until it hits the length of out
   127  // or runs out of values in the column to decode, then returns the number of values actually decoded.
   128  func (d *DeltaLengthByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) {
   129  	max := utils.MinInt(len(out), d.nvals)
   130  	for i := 0; i < max; i++ {
   131  		out[i] = d.data[:d.lengths[i]:d.lengths[i]]
   132  		d.data = d.data[d.lengths[i]:]
   133  	}
   134  	d.nvals -= max
   135  	d.lengths = d.lengths[max:]
   136  	return max, nil
   137  }
   138  
   139  // DecodeSpaced is like Decode, but for spaced data using the provided bitmap to determine where the nulls should be inserted.
   140  func (d *DeltaLengthByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   141  	toread := len(out) - nullCount
   142  	values, _ := d.Decode(out[:toread])
   143  	if values != toread {
   144  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   145  	}
   146  
   147  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   148  }