github.com/apache/arrow/go/v16@v16.1.0/parquet/internal/encoding/delta_byte_array.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package encoding
    18  
    19  import (
    20  	"github.com/apache/arrow/go/v16/arrow/memory"
    21  	"github.com/apache/arrow/go/v16/internal/utils"
    22  	"github.com/apache/arrow/go/v16/parquet"
    23  	"golang.org/x/xerrors"
    24  )
    25  
// DeltaByteArrayEncoder is an encoder for writing byte arrays using delta
// encoding, also known as incremental encoding or front compression. For each
// element in a sequence of values it stores the length of the prefix shared with
// the previous entry plus the remaining suffix; see
// https://en.wikipedia.org/wiki/Incremental_encoding for a longer description.
//
// The output is the sequence of delta-encoded prefix lengths followed by the
// suffixes encoded as delta length byte arrays.
type DeltaByteArrayEncoder struct {
	encoder

	// prefixEncoder and suffixEncoder are created lazily by initEncoders and
	// are nil until then.
	prefixEncoder *DeltaBitPackInt32Encoder
	suffixEncoder *DeltaLengthByteArrayEncoder

	// lastVal is the most recently written value; the next value passed to Put
	// is compared against it to find the shared prefix.
	lastVal parquet.ByteArray
}
    41  
    42  func (enc *DeltaByteArrayEncoder) EstimatedDataEncodedSize() int64 {
    43  	prefixEstimatedSize := int64(0)
    44  	if enc.prefixEncoder != nil {
    45  		prefixEstimatedSize = enc.prefixEncoder.EstimatedDataEncodedSize()
    46  	}
    47  	suffixEstimatedSize := int64(0)
    48  	if enc.suffixEncoder != nil {
    49  		suffixEstimatedSize = enc.suffixEncoder.EstimatedDataEncodedSize()
    50  	}
    51  	return prefixEstimatedSize + suffixEstimatedSize
    52  }
    53  
    54  func (enc *DeltaByteArrayEncoder) initEncoders() {
    55  	enc.prefixEncoder = &DeltaBitPackInt32Encoder{
    56  		deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}
    57  	enc.suffixEncoder = &DeltaLengthByteArrayEncoder{
    58  		newEncoderBase(enc.encoding, nil, enc.mem),
    59  		&DeltaBitPackInt32Encoder{
    60  			deltaBitPackEncoder: &deltaBitPackEncoder{encoder: newEncoderBase(enc.encoding, nil, enc.mem)}}}
    61  }
    62  
    63  // Type returns the underlying physical type this operates on, in this case ByteArrays only
    64  func (DeltaByteArrayEncoder) Type() parquet.Type { return parquet.Types.ByteArray }
    65  
    66  // Put writes a slice of ByteArrays to the encoder
    67  func (enc *DeltaByteArrayEncoder) Put(in []parquet.ByteArray) {
    68  	if len(in) == 0 {
    69  		return
    70  	}
    71  
    72  	var suf parquet.ByteArray
    73  	if enc.prefixEncoder == nil { // initialize our encoders if we haven't yet
    74  		enc.initEncoders()
    75  		enc.prefixEncoder.Put([]int32{0})
    76  		suf = in[0]
    77  		enc.lastVal = in[0]
    78  		enc.suffixEncoder.Put([]parquet.ByteArray{suf})
    79  		in = in[1:]
    80  	}
    81  
    82  	// for each value, figure out the common prefix with the previous value
    83  	// and then write the prefix length and the suffix.
    84  	for _, val := range in {
    85  		l1 := enc.lastVal.Len()
    86  		l2 := val.Len()
    87  		j := 0
    88  		for j < l1 && j < l2 {
    89  			if enc.lastVal[j] != val[j] {
    90  				break
    91  			}
    92  			j++
    93  		}
    94  		enc.prefixEncoder.Put([]int32{int32(j)})
    95  		suf = val[j:]
    96  		enc.suffixEncoder.Put([]parquet.ByteArray{suf})
    97  		enc.lastVal = val
    98  	}
    99  
   100  	// do the memcpy after the loops to keep a copy of the lastVal
   101  	// we do a copy here so that we only copy and keep a reference
   102  	// to the suffix, and aren't forcing the *entire* value to stay
   103  	// in memory while we have this reference to just the suffix.
   104  	enc.lastVal = append([]byte{}, enc.lastVal...)
   105  }
   106  
   107  // PutSpaced is like Put, but assumes the data is already spaced for nulls and uses the bitmap provided and offset
   108  // to compress the data before writing it without the null slots.
   109  func (enc *DeltaByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) {
   110  	if validBits != nil {
   111  		data := make([]parquet.ByteArray, len(in))
   112  		nvalid := spacedCompress(in, data, validBits, validBitsOffset)
   113  		enc.Put(data[:nvalid])
   114  	} else {
   115  		enc.Put(in)
   116  	}
   117  }
   118  
   119  // Flush flushes any remaining data out and returns the finished encoded buffer.
   120  // or returns nil and any error encountered during flushing.
   121  func (enc *DeltaByteArrayEncoder) FlushValues() (Buffer, error) {
   122  	if enc.prefixEncoder == nil {
   123  		enc.initEncoders()
   124  	}
   125  	prefixBuf, err := enc.prefixEncoder.FlushValues()
   126  	if err != nil {
   127  		return nil, err
   128  	}
   129  	defer prefixBuf.Release()
   130  
   131  	suffixBuf, err := enc.suffixEncoder.FlushValues()
   132  	if err != nil {
   133  		return nil, err
   134  	}
   135  	defer suffixBuf.Release()
   136  
   137  	ret := bufferPool.Get().(*memory.Buffer)
   138  	ret.ResizeNoShrink(prefixBuf.Len() + suffixBuf.Len())
   139  	copy(ret.Bytes(), prefixBuf.Bytes())
   140  	copy(ret.Bytes()[prefixBuf.Len():], suffixBuf.Bytes())
   141  	return poolBuffer{ret}, nil
   142  }
   143  
// DeltaByteArrayDecoder is a decoder for a column of data encoded using incremental or prefix encoding.
type DeltaByteArrayDecoder struct {
	// the embedded decoder is used to read the suffix of every value
	*DeltaLengthByteArrayDecoder

	// prefixLengths holds the prefix lengths decoded by SetData that have not
	// yet been consumed by Decode.
	prefixLengths []int32

	// lastVal is the most recently decoded value; Decode takes the prefix of
	// each following value from it.
	lastVal parquet.ByteArray
}
   151  
   152  // Type returns the underlying physical type this decoder operates on, in this case ByteArrays only
   153  func (DeltaByteArrayDecoder) Type() parquet.Type {
   154  	return parquet.Types.ByteArray
   155  }
   156  
   157  func (d *DeltaByteArrayDecoder) Allocator() memory.Allocator { return d.mem }
   158  
   159  // SetData expects the passed in data to be the prefix lengths, followed by the
   160  // blocks of suffix data in order to initialize the decoder.
   161  func (d *DeltaByteArrayDecoder) SetData(nvalues int, data []byte) error {
   162  	prefixLenDec := DeltaBitPackInt32Decoder{
   163  		deltaBitPackDecoder: &deltaBitPackDecoder{
   164  			decoder: newDecoderBase(d.encoding, d.descr),
   165  			mem:     d.mem}}
   166  
   167  	if err := prefixLenDec.SetData(nvalues, data); err != nil {
   168  		return err
   169  	}
   170  
   171  	d.prefixLengths = make([]int32, nvalues)
   172  	// decode all the prefix lengths first so we know how many bytes it took to get the
   173  	// prefix lengths for nvalues
   174  	prefixLenDec.Decode(d.prefixLengths)
   175  
   176  	// now that we know how many bytes we needed for the prefix lengths, the rest are the
   177  	// delta length byte array encoding.
   178  	return d.DeltaLengthByteArrayDecoder.SetData(nvalues, data[int(prefixLenDec.bytesRead()):])
   179  }
   180  
   181  // Decode decodes byte arrays into the slice provided and returns the number of values actually decoded
   182  func (d *DeltaByteArrayDecoder) Decode(out []parquet.ByteArray) (int, error) {
   183  	max := utils.Min(len(out), d.nvals)
   184  	if max == 0 {
   185  		return 0, nil
   186  	}
   187  	out = out[:max]
   188  
   189  	var err error
   190  	if d.lastVal == nil {
   191  		_, err = d.DeltaLengthByteArrayDecoder.Decode(out[:1])
   192  		if err != nil {
   193  			return 0, err
   194  		}
   195  		d.lastVal = out[0]
   196  		out = out[1:]
   197  		d.prefixLengths = d.prefixLengths[1:]
   198  	}
   199  
   200  	var prefixLen int32
   201  	suffixHolder := make([]parquet.ByteArray, 1)
   202  	for len(out) > 0 {
   203  		prefixLen, d.prefixLengths = d.prefixLengths[0], d.prefixLengths[1:]
   204  
   205  		prefix := d.lastVal[:prefixLen:prefixLen]
   206  		_, err = d.DeltaLengthByteArrayDecoder.Decode(suffixHolder)
   207  		if err != nil {
   208  			return 0, err
   209  		}
   210  
   211  		if len(suffixHolder[0]) == 0 {
   212  			d.lastVal = prefix
   213  		} else {
   214  			d.lastVal = make([]byte, int(prefixLen)+len(suffixHolder[0]))
   215  			copy(d.lastVal, prefix)
   216  			copy(d.lastVal[prefixLen:], suffixHolder[0])
   217  		}
   218  		out[0], out = d.lastVal, out[1:]
   219  	}
   220  	return max, nil
   221  }
   222  
   223  // DecodeSpaced is like decode, but the result is spaced out based on the bitmap provided.
   224  func (d *DeltaByteArrayDecoder) DecodeSpaced(out []parquet.ByteArray, nullCount int, validBits []byte, validBitsOffset int64) (int, error) {
   225  	toread := len(out) - nullCount
   226  	values, err := d.Decode(out[:toread])
   227  	if err != nil {
   228  		return values, err
   229  	}
   230  	if values != toread {
   231  		return values, xerrors.New("parquet: number of values / definition levels read did not match")
   232  	}
   233  
   234  	return spacedExpand(out, nullCount, validBits, validBitsOffset), nil
   235  }